Beispiel #1
0
 def write_header_comment_file(self):
     """Write the CSV header row to the comment output file, once.

     No-op if the header was already written; otherwise appends the header
     row to 'comment_<outfile>' and sets the written flag.
     """
     if self.header_written:
         return
     comment_file = 'comment_' + self.outfile
     # 'with' guarantees the handle is closed even if writerow raises
     # (the original leaked the handle on error).
     with open(comment_file, 'ab') as wrtr_file:
         wrtr = UnicodeWriter(wrtr_file)
         wrtr.writerow([u'Language', u'Url', u'Published', u'Country',
                        u'ThreadId', u'Inserted', u'PostSize',
                        u'Subject', u'Text'])
     self.header_written = True
Beispiel #2
0
 def __init__(self, key, outfile, filter_thread):
     """Fetch all comments of one thread and append them to `outfile` as CSV.

     Parameters
     ----------
     key : str
         BoardReader API key.
     outfile : str
         Path of the CSV file rows are appended to (binary append mode).
     filter_thread
         Thread id used to restrict the query to a single thread.
     """
     self.url = 'http://api.boardreader.com/v1/Blogs/Thread'
     self.key = key
     self.outfile = outfile
     self.filter_thread = filter_thread
     # NOTE(review): no timeout is set, so a hung server blocks forever --
     # confirm whether adding one is acceptable for callers.
     self.response = requests.get(
         self.url,
         {'key': self.key, 'rt': 'json', 'filter_thread': self.filter_thread})
     # 'with' closes the output file even if the JSON parsing below raises.
     with open(self.outfile, 'ab') as wrtr_file:
         wrtr = UnicodeWriter(wrtr_file)
         for item in self.response.json()['response']['Matches']['Match']:
             # Strip characters that would break the CSV layout.  Raw string
             # avoids an invalid-escape warning ('\,' is just a comma here).
             text = re.sub(r'[,\n\t\b-]', '', item['Text'])
             text = text.replace(',', '')
             wrtr.writerow([item[u'Language'], item[u'Url'], item[u'Published'],
                            item['Country'], item[u'ThreadId'], item[u'Inserted'],
                            str(item['PostSize']),
                            item['Subject'].replace(',', ''), text])
Beispiel #3
0
 def save_response(self):
     """Page through the remaining API results, write all matches to
     `outfile`, then fetch comments for every match that has any.

     NOTE(review): the paging loop assumes self.get_response() decrements
     self.query_limit; if it does not, this loop never terminates -- confirm.
     """
     while self.query_limit > 0:
         # Resume the query from the timestamp of the last match fetched.
         self.filter_inserted_to = int(time.mktime(time.strptime(
             self.response['Matches']['Match'][-1]['Published'],
             "%Y-%m-%d %H:%M:%S")))
         self.response['Matches']['Match'].extend(
             self.get_response()['Matches']['Match'])
     # 'with' guarantees the file is closed even if a row write raises.
     with open(self.outfile, 'wb') as wrtr_file:
         wrtr = UnicodeWriter(wrtr_file)
         wrtr.writerow([u'Language', u'Url', u'Published', u'Country',
                        u'ThreadId', u'Inserted', u'PostSize',
                        u'Subject', u'Text'])
         for item in self.response['Matches']['Match']:
             # Remove characters that would corrupt the CSV row; raw string
             # avoids an invalid-escape warning ('\,' is just a comma here).
             text = re.sub(r'[,\n\t\b-]', '', item['Text'])
             text = text.replace(',', '')
             wrtr.writerow([item[u'Language'], item[u'Url'], item[u'Published'],
                            item['Country'], item[u'ThreadId'], item[u'Inserted'],
                            str(item['PostSize']),
                            item['Subject'].replace(',', ''), text])
     for item in self.response['Matches']['Match']:
         if item[u'CommentsInThread'] > 0:
             self.write_header_comment_file()
             # Comment.__init__ performs the fetch and write as a side effect.
             Comment(key=self.key, outfile='comment_' + self.outfile,
                     filter_thread=item[u'ThreadId'])
Beispiel #4
0
        if phone_info is None: continue
        phone = phone_info.text
        address = info.find('address').find('span').text
        wrtr.writerow([city,subcategory,name,phone,address])
        #print name, phone, address, city, subcategory
    if data.find('ul','pager'):
        if data.find('ul','pager').find('li','next'):
            page = page + 1
            writerecord(base_url, subcategory,city, page)


if __name__ == '__main__':
    base_url = 'http://yellowpages.sulekha.com/clothing-accessories_delhi_clistings'
    cities = ['chennai']
    # 'with' closes the CSV even if scraping raises midway
    # (the original leaked the handle on any exception before close()).
    with open('chennai_sulekha1.csv', 'wb') as outfile:
        # 'wrtr' stays module-level: writerecord() reads it as a global.
        wrtr = UnicodeWriter(outfile, delimiter=';')
        wrtr.writerow(['City', 'Category', 'Name', 'Phone', 'Address'])
        for city in cities:
            url = base_url.replace('delhi', city)
            # Crude resume logic: skip listings until the known resume URL
            # is seen, then process it and everything after it.
            skipping = True
            links = BeautifulSoup(urlopen(url), 'html.parser')
            for line in links.find('ol', 'business-clisting').findAll('div', 'blockTitle'):
                new_url = line.find('a')['href']
                if new_url == 'http://yellowpages.sulekha.com/tie-manufacturers_chennai':
                    skipping = False
                if skipping:
                    continue
                incity = '_' + city
                subcategory = new_url.split('/')[-1].replace('-', ' ').replace(incity, '')
                writerecord(new_url, subcategory, city)
Beispiel #5
0
    data = BeautifulSoup(urlopen(url),"html.parser")
    #print url
    for item in data.findAll('div','card'):
        if (item.find('div','name') == None): break
        name = item.find('div','name').text.strip()
        place = item.find('div','place').text.strip()
        phone = item.find('a','mob-link').text.strip()
        if len(phone) == 0: continue
        wrtr.writerow([category,name,place,phone])

def locations(city_file):
    """Yield (locality, city) pairs from the first two columns of a CSV file.

    Parameters
    ----------
    city_file : str
        Path to a CSV file whose rows have at least two columns.

    Yields
    ------
    tuple
        (column 0, column 1) of each row, as strings.
    """
    # 'with' closes the file deterministically; the original never closed
    # it and leaked the handle until garbage collection.
    with open(city_file) as in_file:
        for row in csv.reader(in_file):
            yield (row[0], row[1])


if __name__ == '__main__':
    base_url = 'https://www.askme.com/search?q='
    # BUG FIX: in Python 3, map() returns a one-shot iterator, so the
    # original `categories` was exhausted after the first locality and every
    # later locality silently searched zero categories.  Materialize a list.
    # The 'with' also closes the handle the original open().readlines() leaked.
    with open('categories') as cat_file:
        categories = [line.strip() for line in cat_file]
    # 'with' closes the output CSV even if scraping raises midway.
    with open('askme_mumbai.csv', 'w') as outfile:
        wrtr = UnicodeWriter(outfile)
        wrtr.writerow(['category', 'name', 'address', 'phone'])
        for locality, city in locations('mumbai_localities.csv'):
            # Build a URL-safe locality token.
            loc = locality.replace('.', '').replace('-', '').replace(' ', '+')
            for category in categories:
                cat = category.strip().replace(' ', '+') + '+'
                url = base_url + cat + 'in+' + loc + '&type=outlets&city=' + city
                writerecord(url, wrtr, category)