def write_header_comment_file(self):
    """Append the CSV header row to the comment file, at most once.

    The comment file is named ``'comment_' + self.outfile`` and is opened in
    binary append mode, so any rows already in it are kept.  The
    ``self.header_written`` flag is set after a successful write and makes
    every later call a no-op.

    Returns:
        None.
    """
    if self.header_written:
        return

    comment_file = 'comment_' + self.outfile
    # ``with`` closes the handle even when the writer raises, which the
    # previous open()/close() pair did not guarantee.
    with open(comment_file, 'ab') as wrtr_file:
        wrtr = UnicodeWriter(wrtr_file)
        wrtr.writerow([u'Language', u'Url', u'Published', u'Country',
                       u'ThreadId', u'Inserted', u'PostSize', u'Subject',
                       u'Text'])
    self.header_written = True
def __init__(self, key, outfile, filter_thread):
    """Fetch one thread's matches and append them to ``outfile`` as CSV rows.

    A GET request is sent to the Boardreader ``Blogs/Thread`` endpoint with
    ``key``, ``rt=json`` and ``filter_thread`` as query parameters.  The
    response object is kept on ``self.response`` and every entry under
    ``response -> Matches -> Match`` of the decoded JSON becomes one row.
    No header row is written here.

    Args:
        key: value sent as the ``key`` query parameter.
        outfile: path of the CSV file; opened in binary append mode.
        filter_thread: value sent as the ``filter_thread`` query parameter.
    """
    self.url = 'http://api.boardreader.com/v1/Blogs/Thread'
    self.key = key
    self.outfile = outfile
    self.filter_thread = filter_thread
    self.response = requests.get(
        self.url,
        {'key': self.key, 'rt': 'json', 'filter_thread': self.filter_thread})

    # ``with`` closes the file even if JSON decoding or a key lookup fails
    # part-way through the loop.
    with open(self.outfile, 'ab') as wrtr_file:
        wrtr = UnicodeWriter(wrtr_file)
        for item in self.response.json()['response']['Matches']['Match']:
            # Strip commas, newlines, tabs, backspaces and hyphens.  Inside
            # a character class ``\b`` is the backspace character; the raw
            # string avoids the invalid ``\,`` string escape.  Commas are
            # already covered here, so no extra replace() is needed.
            text = re.sub(r'[,\n\t\b-]', '', item['Text'])
            wrtr.writerow([item[u'Language'],
                           item[u'Url'],
                           item[u'Published'],
                           item['Country'],
                           item[u'ThreadId'],
                           item[u'Inserted'],
                           str(item['PostSize']),
                           item['Subject'].replace(',', ''),
                           text])
def save_response(self):
    """Collect the remaining result pages, then dump all matches to CSV.

    While ``self.query_limit`` is positive, ``self.filter_inserted_to`` is
    set to the ``Published`` time of the last match collected so far
    (parsed as ``%Y-%m-%d %H:%M:%S`` and converted with ``time.mktime``),
    and the matches returned by ``self.get_response()`` are appended to
    ``self.response['Matches']['Match']``.

    Afterwards ``self.outfile`` is rewritten from scratch: a header row
    followed by one row per accumulated match.  Finally, for every match
    whose ``CommentsInThread`` is greater than zero, the comment-file
    header is ensured and a ``Comment`` is constructed for that thread
    with ``'comment_' + self.outfile`` as its output file.

    NOTE(review): the original line breaks were lost; this layout assumes
    the CSV dump runs once, after the paging loop -- confirm.
    NOTE(review): the loop relies on ``get_response()`` (not visible here)
    to lower ``self.query_limit``, and an empty match list makes the
    ``[-1]`` lookup raise IndexError -- verify against the caller.
    """
    while self.query_limit > 0:
        last_published = self.response['Matches']['Match'][-1]['Published']
        self.filter_inserted_to = int(time.mktime(
            time.strptime(last_published, "%Y-%m-%d %H:%M:%S")))
        self.response['Matches']['Match'].extend(
            self.get_response()['Matches']['Match'])

    matches = self.response['Matches']['Match']

    # ``with`` closes the file even if a row is malformed and raises.
    with open(self.outfile, 'wb') as wrtr_file:
        wrtr = UnicodeWriter(wrtr_file)
        wrtr.writerow([u'Language', u'Url', u'Published', u'Country',
                       u'ThreadId', u'Inserted', u'PostSize', u'Subject',
                       u'Text'])
        for item in matches:
            # Strip commas, newlines, tabs, backspaces and hyphens.  Inside
            # a character class ``\b`` is the backspace character; the raw
            # string avoids the invalid ``\,`` string escape.  Commas are
            # already covered here, so no extra replace() is needed.
            text = re.sub(r'[,\n\t\b-]', '', item['Text'])
            wrtr.writerow([item[u'Language'],
                           item[u'Url'],
                           item[u'Published'],
                           item['Country'],
                           item[u'ThreadId'],
                           item[u'Inserted'],
                           str(item['PostSize']),
                           item['Subject'].replace(',', ''),
                           text])

    for item in matches:
        if item[u'CommentsInThread'] > 0:
            self.write_header_comment_file()
            # Constructing Comment is what fetches and stores the thread's
            # rows; the instance itself is not kept.
            Comment(key=self.key,
                    outfile='comment_' + self.outfile,
                    filter_thread=item[u'ThreadId'])
# NOTE(review): the line below starts partway through a function body whose
# ``def`` line is not visible here (presumably ``writerecord``, since it calls
# itself with ``page + 1`` -- confirm), followed by the ``__main__`` script.
# Original line breaks/indentation appear to have been lost; as written, the
# ``#print`` comment swallows the rest of this physical line.
#
# Visible function tail: entries whose ``phone_info`` is None are skipped;
# otherwise [city, subcategory, name, phone, address] is written through the
# module-level ``wrtr``.  If the page has a ``ul.pager`` containing a
# ``li.next``, ``page`` is incremented and
# ``writerecord(base_url, subcategory, city, page)`` is called.
#
# Script section: writes ';'-delimited rows to 'chennai_sulekha1.csv' (header
# first).  For each city, 'delhi' in ``base_url`` is replaced by the city and
# every ``div.blockTitle`` link under ``ol.business-clisting`` is visited.
# The subcategory is the last URL path segment with '-' turned into ' ' and
# the '_<city>' suffix removed.
# NOTE(review): ``next_iter`` makes the loop skip every link until the
# hard-coded '.../tie-manufacturers_chennai' URL is seen -- looks like a manual
# resume marker.  For any city other than 'chennai' that URL would never
# match and all links would be skipped; confirm this is intended.
if phone_info is None: continue phone = phone_info.text address = info.find('address').find('span').text wrtr.writerow([city,subcategory,name,phone,address]) #print name, phone, address, city, subcategory if data.find('ul','pager'): if data.find('ul','pager').find('li','next'): page = page + 1 writerecord(base_url, subcategory,city, page) if __name__ == '__main__': base_url = 'http://yellowpages.sulekha.com/clothing-accessories_delhi_clistings' cities = ['chennai'] outfile = open('chennai_sulekha1.csv','wb') wrtr = UnicodeWriter(outfile,delimiter=';') wrtr.writerow(['City','Category','Name','Phone','Address']) for city in cities: url = base_url.replace('delhi',city) next_iter = True links = BeautifulSoup(urlopen(url),'html.parser') for line in links.find('ol','business-clisting').findAll('div','blockTitle'): new_url = line.find('a')['href'] if new_url == 'http://yellowpages.sulekha.com/tie-manufacturers_chennai': next_iter = False if next_iter: continue incity = '_' + city subcategory = new_url.split('/')[-1].replace('-',' ').replace(incity,'') writerecord(new_url,subcategory,city) outfile.close()
# NOTE(review): the line below starts partway through a function body whose
# ``def`` line is not visible here (presumably ``writerecord(url, wrtr,
# category)``, matching the call in the script section -- confirm), followed
# by ``locations`` and the ``__main__`` script.  Original line
# breaks/indentation appear to have been lost; as written, the ``#print url``
# comment swallows the rest of this physical line.
#
# Visible function tail: fetches ``url`` and parses it with the 'html.parser'
# backend.  For each ``div.card`` it stops at the first card that has no
# ``div.name``, skips cards whose ``a.mob-link`` text is empty, and writes
# [category, name, place, phone] through ``wrtr``.
# NOTE(review): a card without an ``a.mob-link`` element would make ``.text``
# fail on None -- verify whether that can occur.
#
# ``locations(city_file)``: generator yielding ``(line[0], line[1])`` for each
# row that ``csv.reader`` produces from ``city_file``; the file object is
# never explicitly closed.
#
# Script section: writes a header row plus results to 'askme_mumbai.csv'.  For
# every (locality, city) pair from 'mumbai_localities.csv' and every stripped
# line of the 'categories' file, it builds a search URL (spaces -> '+', and
# '.'/'-' removed from the locality) and calls ``writerecord``.
# NOTE(review): ``categories`` is built with ``map``; on Python 3 that is a
# one-shot iterator and would be exhausted after the first locality, whereas
# on Python 2 it is a list -- confirm the target interpreter.
data = BeautifulSoup(urlopen(url),"html.parser") #print url for item in data.findAll('div','card'): if (item.find('div','name') == None): break name = item.find('div','name').text.strip() place = item.find('div','place').text.strip() phone = item.find('a','mob-link').text.strip() if len(phone) == 0: continue wrtr.writerow([category,name,place,phone]) def locations(city_file): in_file = open(city_file) reader = csv.reader(in_file) for line in reader: yield (line[0], line[1]) if __name__ == '__main__': base_url = 'https://www.askme.com/search?q=' outfile = open('askme_mumbai.csv','w') categories = map(lambda x: x.strip(),open('categories').readlines()) wrtr = UnicodeWriter(outfile) wrtr.writerow(['category','name','address','phone']) for locality,city in locations('mumbai_localities.csv'): loc = locality.replace('.','').replace('-','').replace(' ','+') for category in categories: cat = category.strip().replace(' ','+') + '+' url = base_url+cat+'in+'+loc+'&type=outlets&city='+city writerecord(url,wrtr,category) outfile.close()