def s_thing_widget(soup, page_dict):# more example would come back later lists = soup.findAll('div', attrs={'id':'facebook_share_button'}) print len(lists) url_widget = lists[0].contents[1]['src'] response, content = fetch.fetchio_single(url_widget) if True: #print content soup_widget = BeautifulSoup(content) lists_widget = soup_widget.findAll('div', attrs={'class':'connect_widget_button_count_count'}) widget_count = lists_widget[1].contents[0] print widget_count
def s_thing_widget(soup, page_dict): # more example would come back later lists = soup.findAll('div', attrs={'id': 'facebook_share_button'}) print len(lists) url_widget = lists[0].contents[1]['src'] response, content = fetch.fetchio_single(url_widget) if True: #print content soup_widget = BeautifulSoup(content) lists_widget = soup_widget.findAll( 'div', attrs={'class': 'connect_widget_button_count_count'}) widget_count = lists_widget[1].contents[0] print widget_count
def people_check(url): sql_id = 'SELECT id FROM people WHERE url=?' c = conn.cursor() c.execute(sql_id, (url,)) people_id = c.fetchone() #print "***", people_id #print "***", people_id[0] if people_id: return people_id[0] else: name = '' register_time = '' response, content = fetch.fetchio_single(url) #print content if True: #response, content = http.request(url, 'GET') #if int(response['status']) == 200: soup = BeautifulSoup(content) lists = soup.findAll('div', attrs={'id':'user-meta'}) if lists: #print lists[0].contents name = lists[0].contents[1].contents[0] #print name #print lists[0].contents[3].contents #print "======" for l in lists[0].contents[3].contents: if type(l) == Tag: for r in l.contents: if type(r) == Tag: #print r.contents temp = r.contents[0].strip('\n\t') #print temp t = re.findall('Registered on', temp) if t: register_time = temp.strip('Registered on') #print register_time break #tag_number = tag_number.strip('()') #print t #print "****" #print "======" else: print "user's profile is not existing:"+url sql = 'INSERT INTO people (name, url, register_time) VALUES (?, ?, ?)' param = (name, url, register_time, ) #print param c.execute(sql, param) conn.commit() c.execute(sql_id, (url,)) people_id = c.fetchone() if people_id: return people_id[0] c.close()
def people_check(url): sql_id = "SELECT id FROM people WHERE url=?" c = conn.cursor() c.execute(sql_id, (url,)) people_id = c.fetchone() # print "***", people_id # print "***", people_id[0] if people_id: return people_id[0] else: name = "" register_time = "" response, content = fetch.fetchio_single(url) # print content if True: # response, content = http.request(url, 'GET') # if int(response['status']) == 200: soup = BeautifulSoup(content) lists = soup.findAll("div", attrs={"id": "user-meta"}) if lists: # print lists[0].contents name = lists[0].contents[1].contents[0] # print name # print lists[0].contents[3].contents # print "======" for l in lists[0].contents[3].contents: if type(l) == Tag: for r in l.contents: if type(r) == Tag: # print r.contents temp = r.contents[0].strip("\n\t") # print temp t = re.findall("Registered on", temp) if t: register_time = temp.strip("Registered on") # print register_time break # tag_number = tag_number.strip('()') # print t # print "****" # print "======" else: print "user's profile is not existing:" + url sql = "INSERT INTO people (name, url, register_time) VALUES (?, ?, ?)" param = (name, url, register_time) # print param c.execute(sql, param) conn.commit() c.execute(sql_id, (url,)) people_id = c.fetchone() if people_id: return people_id[0] c.close()
def s_comments(soup, index, page_dict): #lists = soup.findAll('div', attrs={'id':'thing-comments'}) #print len(lists) #print lists #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&permalink[0]=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773 #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&permalink[0]=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&jx[0]=0 #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&jx[0]=0 #http://js-kit.com/comments-data.js?ref=http://www.thingiverse.com/thing:17773&p[0]=/thing:17773&jx[0]=0 url = 'http://js-kit.com/comments-data.js?ref=http://www.thingiverse.com/thing:'+str(index)+'&p[0]=/thing:'+str(index)+'&jx[0]=0' print url response, content = fetch.fetchio_single(url) print response print content if int(response['status']) == 200: print content
def s_comments(soup, index, page_dict): #lists = soup.findAll('div', attrs={'id':'thing-comments'}) #print len(lists) #print lists #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&permalink[0]=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773 #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&permalink[0]=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&jx[0]=0 #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&jx[0]=0 #http://js-kit.com/comments-data.js?ref=http://www.thingiverse.com/thing:17773&p[0]=/thing:17773&jx[0]=0 url = 'http://js-kit.com/comments-data.js?ref=http://www.thingiverse.com/thing:' + str( index) + '&p[0]=/thing:' + str(index) + '&jx[0]=0' print url response, content = fetch.fetchio_single(url) print response print content if int(response['status']) == 200: print content
def s_made(soup, page_dict): #lists = soup.findAll('form', attrs={'id':'i_made_one_form'}) lists = soup.findAll('div', attrs={'id':'thing-made'}) #print len(lists) #print lists if lists: mades = [] deriveds = [] for l in lists: if l.contents[1].contents[0] == "Who's Made It?": try: if l.contents[5].contents[0]: made_count = l.contents[5].contents[0].strip('\n\tView all copies') #print l.contents[5].contents #print made_count made_count = int(made_count) if made_count >= 20: made_lists_url = l.contents[5]['href'] made_lists_url = page_url_root+made_lists_url response, content = fetch.fetchio_single(made_lists_url) soup_lists = BeautifulSoup(content) page_lists = soup_lists.findAll('div', attrs={'class':'pagination'}) page_lists = page_lists[0].contents[1].contents page_last = page_lists[len(page_lists)-2] page_last = page_last.contents[0]['href'].strip() page_last = re.findall('/page:\d+', page_last) if page_last: page_last = page_last[0].strip('/page:') page_last = int(page_last) #print page_last page_first = 1 for page_url in range(page_first, page_last+1, 1): page_url = made_lists_url+"/page:"+str(page_url) print page_url response, content = fetch.fetchio_single(page_url) soup_made = BeautifulSoup(content) made_lists = soup_made.findAll('div', attrs={'class':'things'}) if len(made_lists) == 0: #print "hello:::::::::::::::::::", made_lists continue for li in made_lists[0].contents: if type(li) == Tag: #print ".........." 
#print li.contents if not li.contents: #print '+++++++++++++++' break made_href = li.contents[1].contents[1]['href'] #print made_href made_href = page_url_root+made_href mades.append({pdl.made_url:made_href}) else: lists_made = l.contents[3].contents for lm in lists_made: if type(lm) == Tag: #print lm made_href = lm.contents[1]['href'] mades.append({pdl.made_url:made_href}) #print made_href #print "**********" except Exception, what: print what continue if l.contents[1].contents[0] == "Who's Derived It?": try: if l.contents[5].contents[0]: derivation_count = l.contents[5].contents[0].strip('\n\tView all variations') derivation_count = int(derivation_count) if derivation_count >= 20: #print derivation_count derivation_lists_url = l.contents[5]['href'] #url_root = 'http://www.thingiverse.com' derivation_lists_url = page_url_root+derivation_lists_url response, content = fetch.fetchio_single(derivation_lists_url) soup_lists = BeautifulSoup(content) page_lists = soup_lists.findAll('div', attrs={'class':'pagination'}) page_lists = page_lists[0].contents[1].contents page_last = page_lists[len(page_lists)-2] page_last = page_last.contents[0]['href'].strip() page_last = re.findall('/page:\d+', page_last) if page_last: page_last = page_last[0].strip('/page:') page_last = int(page_last) #print page_last page_first = 1 for page_url in range(page_first, page_last+1, 1): page_url = derivation_lists_url+"/page:"+str(page_url) print page_url response, content = fetch.fetchio_single(page_url) if int(response['status']) != 200: print 'each derivation page is not available' soup_derived = BeautifulSoup(content) derived_lists = soup_derived.findAll('div', attrs={'class':'things'}) if len(derived_lists) == 0: continue for li in derived_lists[0].contents: if type(li) == Tag: #print ".........." 
#print li if not li.contents: #print '+++++++++++++++' break derived_href = li.contents[1].contents[1]['href'] #print derived_href deriveds.append({pdl.derived_url:derived_href}) #print "*****************" #sys.exit() #print "finish" else: lists_derived = l.contents[3].contents for lm in lists_derived: if type(lm) == Tag: #print lm derived_href = lm.contents[1]['href'] deriveds.append({pdl.derived_url:derived_href}) #print derived_href #print "***==========" except Exception, what: print what continue
lists_derived = l.contents[3].contents for lm in lists_derived: if type(lm) == Tag: #print lm derived_href = lm.contents[1]['href'] deriveds.append({pdl.derived_url:derived_href}) #print derived_href #print "***==========" except Exception, what: print what continue if len(mades) > 0: mades_author_urls = [] for m in mades: made_url = m[pdl.made_url] response, content = fetch.fetchio_single(made_url) soup_made_author_url = BeautifulSoup(content) lists = soup_made_author_url.findAll('div', attrs={'class':'byline'}) try: made_time = lists[0].contents[1].contents[2].strip('onby') made_time = made_time.strip() made_author_url = lists[0].contents[3]['href'] mades_author_urls.append({pdl.made_url:made_url, pdl.made_time:made_time, pdl.made_author_url:made_author_url}) except Exception, what: print what continue page_dict[pdl.thing_mades] = mades_author_urls if len(deriveds) > 0: page_dict[pdl.thing_deriveds] = deriveds
def s_made(soup, page_dict): #lists = soup.findAll('form', attrs={'id':'i_made_one_form'}) lists = soup.findAll('div', attrs={'id': 'thing-made'}) #print len(lists) #print lists if lists: mades = [] deriveds = [] for l in lists: if l.contents[1].contents[0] == "Who's Made It?": try: if l.contents[5].contents[0]: made_count = l.contents[5].contents[0].strip( '\n\tView all copies') #print l.contents[5].contents #print made_count made_count = int(made_count) if made_count >= 20: made_lists_url = l.contents[5]['href'] made_lists_url = page_url_root + made_lists_url response, content = fetch.fetchio_single( made_lists_url) soup_lists = BeautifulSoup(content) page_lists = soup_lists.findAll( 'div', attrs={'class': 'pagination'}) page_lists = page_lists[0].contents[1].contents page_last = page_lists[len(page_lists) - 2] page_last = page_last.contents[0]['href'].strip() page_last = re.findall('/page:\d+', page_last) if page_last: page_last = page_last[0].strip('/page:') page_last = int(page_last) #print page_last page_first = 1 for page_url in range(page_first, page_last + 1, 1): page_url = made_lists_url + "/page:" + str( page_url) print page_url response, content = fetch.fetchio_single( page_url) soup_made = BeautifulSoup(content) made_lists = soup_made.findAll( 'div', attrs={'class': 'things'}) if len(made_lists) == 0: #print "hello:::::::::::::::::::", made_lists continue for li in made_lists[0].contents: if type(li) == Tag: #print ".........." 
#print li.contents if not li.contents: #print '+++++++++++++++' break made_href = li.contents[ 1].contents[1]['href'] #print made_href made_href = page_url_root + made_href mades.append( {pdl.made_url: made_href}) else: lists_made = l.contents[3].contents for lm in lists_made: if type(lm) == Tag: #print lm made_href = lm.contents[1]['href'] mades.append({pdl.made_url: made_href}) #print made_href #print "**********" except Exception, what: print what continue if l.contents[1].contents[0] == "Who's Derived It?": try: if l.contents[5].contents[0]: derivation_count = l.contents[5].contents[0].strip( '\n\tView all variations') derivation_count = int(derivation_count) if derivation_count >= 20: #print derivation_count derivation_lists_url = l.contents[5]['href'] #url_root = 'http://www.thingiverse.com' derivation_lists_url = page_url_root + derivation_lists_url response, content = fetch.fetchio_single( derivation_lists_url) soup_lists = BeautifulSoup(content) page_lists = soup_lists.findAll( 'div', attrs={'class': 'pagination'}) page_lists = page_lists[0].contents[1].contents page_last = page_lists[len(page_lists) - 2] page_last = page_last.contents[0]['href'].strip() page_last = re.findall('/page:\d+', page_last) if page_last: page_last = page_last[0].strip('/page:') page_last = int(page_last) #print page_last page_first = 1 for page_url in range(page_first, page_last + 1, 1): page_url = derivation_lists_url + "/page:" + str( page_url) print page_url response, content = fetch.fetchio_single( page_url) if int(response['status']) != 200: print 'each derivation page is not available' soup_derived = BeautifulSoup(content) derived_lists = soup_derived.findAll( 'div', attrs={'class': 'things'}) if len(derived_lists) == 0: continue for li in derived_lists[0].contents: if type(li) == Tag: #print ".........." 
#print li if not li.contents: #print '+++++++++++++++' break derived_href = li.contents[ 1].contents[1]['href'] #print derived_href deriveds.append({ pdl.derived_url: derived_href }) #print "*****************" #sys.exit() #print "finish" else: lists_derived = l.contents[3].contents for lm in lists_derived: if type(lm) == Tag: #print lm derived_href = lm.contents[1]['href'] deriveds.append( {pdl.derived_url: derived_href}) #print derived_href #print "***==========" except Exception, what: print what continue
for lm in lists_derived: if type(lm) == Tag: #print lm derived_href = lm.contents[1]['href'] deriveds.append( {pdl.derived_url: derived_href}) #print derived_href #print "***==========" except Exception, what: print what continue if len(mades) > 0: mades_author_urls = [] for m in mades: made_url = m[pdl.made_url] response, content = fetch.fetchio_single(made_url) soup_made_author_url = BeautifulSoup(content) lists = soup_made_author_url.findAll('div', attrs={'class': 'byline'}) try: made_time = lists[0].contents[1].contents[2].strip('onby') made_time = made_time.strip() made_author_url = lists[0].contents[3]['href'] mades_author_urls.append({ pdl.made_url: made_url, pdl.made_time: made_time, pdl.made_author_url: made_author_url })
def web_reading_threads():
    """Run the threaded multi-fetch pipeline with the web_read/web_processor callbacks."""
    # NOTE(review): the three ints are passed straight through -- their
    # meaning (pool/batch sizes?) is defined by fetch.fetch_multi_once_thread_pool;
    # confirm there before changing them.
    scripting_cb = web_read.content_scripting
    processing_cb = web_processor.page_processing
    fetch.fetch_multi_once_thread_pool(200, 100, 100,
                                       scripting_cb, processing_cb)