Ejemplo n.º 1
0
def s_thing_widget(soup, page_dict):# more example would come back later
    lists = soup.findAll('div', attrs={'id':'facebook_share_button'})
    print len(lists)
    url_widget = lists[0].contents[1]['src']
    response, content = fetch.fetchio_single(url_widget)
    if True:
        #print content
        soup_widget = BeautifulSoup(content)
        lists_widget = soup_widget.findAll('div', attrs={'class':'connect_widget_button_count_count'})
        widget_count = lists_widget[1].contents[0]
        print widget_count
Ejemplo n.º 2
0
def s_thing_widget(soup, page_dict):  # more example would come back later
    lists = soup.findAll('div', attrs={'id': 'facebook_share_button'})
    print len(lists)
    url_widget = lists[0].contents[1]['src']
    response, content = fetch.fetchio_single(url_widget)
    if True:
        #print content
        soup_widget = BeautifulSoup(content)
        lists_widget = soup_widget.findAll(
            'div', attrs={'class': 'connect_widget_button_count_count'})
        widget_count = lists_widget[1].contents[0]
        print widget_count
Ejemplo n.º 3
0
def people_check(url):
    sql_id = 'SELECT id FROM people WHERE url=?'
    c = conn.cursor()
    c.execute(sql_id, (url,))
    people_id = c.fetchone()
    #print "***", people_id
    #print "***", people_id[0]
    if people_id:
        return people_id[0]
    else:
        name = ''
        register_time = ''
        response, content = fetch.fetchio_single(url)
        #print content
        if True:
        #response, content = http.request(url, 'GET')
        #if int(response['status']) == 200:
            soup = BeautifulSoup(content)
            lists = soup.findAll('div', attrs={'id':'user-meta'})
            if lists:
                #print lists[0].contents
                name = lists[0].contents[1].contents[0]
                #print name
                #print lists[0].contents[3].contents
                #print "======"
                for l in lists[0].contents[3].contents:
                    if type(l) == Tag:
                        for r in l.contents:
                            if type(r) == Tag:
                                #print r.contents
                                temp = r.contents[0].strip('\n\t')
                                #print temp
                                t = re.findall('Registered on', temp)
                                if t:
                                    register_time = temp.strip('Registered on')
                                    #print register_time
                                    break
                                #tag_number = tag_number.strip('()')
                                #print t
                                #print "****"
                        #print "======"
        else:
            print "user's profile is not existing:"+url
        sql = 'INSERT INTO people (name, url, register_time) VALUES (?, ?, ?)'
        param = (name, url, register_time, )
        #print param
        c.execute(sql, param)
        conn.commit()
        c.execute(sql_id, (url,))
        people_id = c.fetchone()
        if people_id:
            return people_id[0]
    c.close()
Ejemplo n.º 4
0
def people_check(url):
    sql_id = "SELECT id FROM people WHERE url=?"
    c = conn.cursor()
    c.execute(sql_id, (url,))
    people_id = c.fetchone()
    # print "***", people_id
    # print "***", people_id[0]
    if people_id:
        return people_id[0]
    else:
        name = ""
        register_time = ""
        response, content = fetch.fetchio_single(url)
        # print content
        if True:
            # response, content = http.request(url, 'GET')
            # if int(response['status']) == 200:
            soup = BeautifulSoup(content)
            lists = soup.findAll("div", attrs={"id": "user-meta"})
            if lists:
                # print lists[0].contents
                name = lists[0].contents[1].contents[0]
                # print name
                # print lists[0].contents[3].contents
                # print "======"
                for l in lists[0].contents[3].contents:
                    if type(l) == Tag:
                        for r in l.contents:
                            if type(r) == Tag:
                                # print r.contents
                                temp = r.contents[0].strip("\n\t")
                                # print temp
                                t = re.findall("Registered on", temp)
                                if t:
                                    register_time = temp.strip("Registered on")
                                    # print register_time
                                    break
                                # tag_number = tag_number.strip('()')
                                # print t
                                # print "****"
                        # print "======"
        else:
            print "user's profile is not existing:" + url
        sql = "INSERT INTO people (name, url, register_time) VALUES (?, ?, ?)"
        param = (name, url, register_time)
        # print param
        c.execute(sql, param)
        conn.commit()
        c.execute(sql_id, (url,))
        people_id = c.fetchone()
        if people_id:
            return people_id[0]
    c.close()
Ejemplo n.º 5
0
def s_comments(soup, index, page_dict):
    #lists = soup.findAll('div', attrs={'id':'thing-comments'})
    #print len(lists)
    #print lists
    #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&permalink[0]=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773
    #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&permalink[0]=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&jx[0]=0
    #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&jx[0]=0
    #http://js-kit.com/comments-data.js?ref=http://www.thingiverse.com/thing:17773&p[0]=/thing:17773&jx[0]=0
    url = 'http://js-kit.com/comments-data.js?ref=http://www.thingiverse.com/thing:'+str(index)+'&p[0]=/thing:'+str(index)+'&jx[0]=0'
    print url
    response, content = fetch.fetchio_single(url)
    print response
    print content
    if int(response['status']) == 200:
        print content
Ejemplo n.º 6
0
def s_comments(soup, index, page_dict):
    #lists = soup.findAll('div', attrs={'id':'thing-comments'})
    #print len(lists)
    #print lists
    #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&permalink[0]=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773
    #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&permalink[0]=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&jx[0]=0
    #http://js-kit.com/comments-data.js?ref=http%3A%2F%2Fwww.thingiverse.com%2Fthing%3A17773&p[0]=%2Fthing%3A17773&jx[0]=0
    #http://js-kit.com/comments-data.js?ref=http://www.thingiverse.com/thing:17773&p[0]=/thing:17773&jx[0]=0
    url = 'http://js-kit.com/comments-data.js?ref=http://www.thingiverse.com/thing:' + str(
        index) + '&p[0]=/thing:' + str(index) + '&jx[0]=0'
    print url
    response, content = fetch.fetchio_single(url)
    print response
    print content
    if int(response['status']) == 200:
        print content
Ejemplo n.º 7
0
def s_made(soup, page_dict):
    """Scrape the "Who's Made It?" and "Who's Derived It?" sections of a
    thing page, following the paginated listing pages when a section has
    20 or more entries, and collect the item URLs.

    NOTE(review): this snippet looks truncated -- ``mades`` and
    ``deriveds`` are filled but never stored into ``page_dict`` in the
    visible code; confirm against the full original.
    """
    #lists = soup.findAll('form', attrs={'id':'i_made_one_form'})
    lists = soup.findAll('div', attrs={'id':'thing-made'})
    #print len(lists)
    #print lists
    if lists:
        mades = []  # accumulates {pdl.made_url: href} dicts
        deriveds = []  # accumulates {pdl.derived_url: href} dicts
        for l in lists:
            # ---- "Who's Made It?" section ----
            if l.contents[1].contents[0] == "Who's Made It?":
                try:
                    if l.contents[5].contents[0]:
                        # NOTE(review): strip() removes a *character set*,
                        # not a literal suffix; the count digits survive
                        # only because no digit is in the set.
                        made_count = l.contents[5].contents[0].strip('\n\tView all copies')
                        #print l.contents[5].contents
                        #print made_count
                        made_count = int(made_count)
                        if made_count >= 20:
                            # 20+ copies: walk the paginated listing pages.
                            made_lists_url = l.contents[5]['href']
                            made_lists_url = page_url_root+made_lists_url
                            response, content = fetch.fetchio_single(made_lists_url)
                            soup_lists = BeautifulSoup(content)
                            page_lists = soup_lists.findAll('div', attrs={'class':'pagination'})
                            page_lists = page_lists[0].contents[1].contents
                            # Second-to-last pagination node links to the last page.
                            page_last = page_lists[len(page_lists)-2]
                            page_last = page_last.contents[0]['href'].strip()
                            page_last = re.findall('/page:\d+', page_last)
                            if page_last:
                                # same char-set strip caveat as above
                                page_last = page_last[0].strip('/page:')
                                page_last = int(page_last)
                                #print page_last
                                page_first = 1
                                for page_url in range(page_first, page_last+1, 1):
                                    page_url = made_lists_url+"/page:"+str(page_url)
                                    print page_url
                                    response, content = fetch.fetchio_single(page_url)
                                    soup_made = BeautifulSoup(content)
                                    made_lists = soup_made.findAll('div', attrs={'class':'things'})
                                    if len(made_lists) == 0:
                                        #print "hello:::::::::::::::::::", made_lists
                                        continue
                                    for li in made_lists[0].contents:
                                        if type(li) == Tag:
                                            #print ".........."
                                            #print li.contents
                                            if not li.contents:
                                                #print '+++++++++++++++'
                                                break
                                            made_href = li.contents[1].contents[1]['href']
                                            #print made_href
                                            made_href = page_url_root+made_href
                                            mades.append({pdl.made_url:made_href})
                        else:
                            # Fewer than 20 copies: entries are inline on this page.
                            lists_made = l.contents[3].contents
                            for lm in lists_made:
                                if type(lm) == Tag:
                        #print lm
                                    made_href = lm.contents[1]['href']
                                    mades.append({pdl.made_url:made_href})
                        #print made_href
                        #print "**********"
                except Exception, what:
                    # Best-effort scraping: log and move on.
                    print what
                    continue
            # ---- "Who's Derived It?" section (mirrors the block above) ----
            if l.contents[1].contents[0] == "Who's Derived It?":
                try:
                    if l.contents[5].contents[0]:
                        derivation_count = l.contents[5].contents[0].strip('\n\tView all variations')
                        derivation_count = int(derivation_count)
                        if derivation_count >= 20:
                            #print derivation_count
                            derivation_lists_url = l.contents[5]['href']
                            #url_root = 'http://www.thingiverse.com'
                            derivation_lists_url = page_url_root+derivation_lists_url
                            response, content = fetch.fetchio_single(derivation_lists_url)
                            soup_lists = BeautifulSoup(content)
                            page_lists = soup_lists.findAll('div', attrs={'class':'pagination'})
                            page_lists = page_lists[0].contents[1].contents
                            page_last = page_lists[len(page_lists)-2]
                            page_last = page_last.contents[0]['href'].strip()
                            page_last = re.findall('/page:\d+', page_last)
                            if page_last:
                                page_last = page_last[0].strip('/page:')
                                page_last = int(page_last)
                                #print page_last
                                page_first = 1
                                for page_url in range(page_first, page_last+1, 1):
                                    page_url = derivation_lists_url+"/page:"+str(page_url)
                                    print page_url
                                    response, content = fetch.fetchio_single(page_url)
                                    if int(response['status']) != 200:
                                        print 'each derivation page is not available'
                                    soup_derived = BeautifulSoup(content)
                                    derived_lists = soup_derived.findAll('div', attrs={'class':'things'})
                                    if len(derived_lists) == 0:
                                        continue
                                    for li in derived_lists[0].contents:
                                        if type(li) == Tag:
                                            #print ".........."
                                            #print li
                                            if not li.contents:
                                                #print '+++++++++++++++'
                                                break
                                            derived_href = li.contents[1].contents[1]['href']
                                            #print derived_href
                                            # NOTE(review): unlike the made branch,
                                            # page_url_root is NOT prepended here -- confirm.
                                            deriveds.append({pdl.derived_url:derived_href})
                                            #print "*****************"
                                    #sys.exit()
                                    #print "finish"
                        else:
                            lists_derived = l.contents[3].contents
                            for lm in lists_derived:
                                if type(lm) == Tag:
                        #print lm
                                    derived_href = lm.contents[1]['href']
                                    deriveds.append({pdl.derived_url:derived_href})
                        #print derived_href
                        #print "***=========="
                except Exception, what:
                    print what
                    continue
Ejemplo n.º 8
0
                            # NOTE(review): truncated fragment -- this is the
                            # tail of s_made(); the enclosing def/for/try are
                            # not visible here. Code kept verbatim.
                            lists_derived = l.contents[3].contents
                            for lm in lists_derived:
                                if type(lm) == Tag:
                        #print lm
                                    derived_href = lm.contents[1]['href']
                                    deriveds.append({pdl.derived_url:derived_href})
                        #print derived_href
                        #print "***=========="
                except Exception, what:
                    print what
                    continue
        if len(mades) > 0:
            # Visit each "made" page to extract its timestamp and author URL.
            mades_author_urls = []
            for m in mades:
                made_url = m[pdl.made_url]
                response, content = fetch.fetchio_single(made_url)
                soup_made_author_url = BeautifulSoup(content)
                lists = soup_made_author_url.findAll('div', attrs={'class':'byline'})
                try:
                    # NOTE(review): strip('onby') removes a character set, not
                    # the words "on"/"by" -- it may also eat date characters.
                    made_time = lists[0].contents[1].contents[2].strip('onby')
                    made_time = made_time.strip()
                    made_author_url = lists[0].contents[3]['href']
                    mades_author_urls.append({pdl.made_url:made_url, pdl.made_time:made_time, pdl.made_author_url:made_author_url})
                except Exception, what:
                    print what
                    continue
            page_dict[pdl.thing_mades] = mades_author_urls
        if len(deriveds) > 0:
            page_dict[pdl.thing_deriveds] = deriveds

Ejemplo n.º 9
0
def s_made(soup, page_dict):
    """Scrape the "Who's Made It?" and "Who's Derived It?" sections of a
    thing page, paging through the full listing when a section reports
    20 or more entries.

    NOTE(review): snippet looks truncated -- ``mades``/``deriveds`` are
    built but never stored into ``page_dict`` in the visible code.
    """
    #lists = soup.findAll('form', attrs={'id':'i_made_one_form'})
    lists = soup.findAll('div', attrs={'id': 'thing-made'})
    #print len(lists)
    #print lists
    if lists:
        mades = []  # {pdl.made_url: href} entries
        deriveds = []  # {pdl.derived_url: href} entries
        for l in lists:
            # ---- "Who's Made It?" section ----
            if l.contents[1].contents[0] == "Who's Made It?":
                try:
                    if l.contents[5].contents[0]:
                        # NOTE(review): strip() takes a *character set*, not
                        # a literal suffix; the digits survive only because
                        # none of them is in the set.
                        made_count = l.contents[5].contents[0].strip(
                            '\n\tView all copies')
                        #print l.contents[5].contents
                        #print made_count
                        made_count = int(made_count)
                        if made_count >= 20:
                            # 20+ copies: walk the paginated listing pages.
                            made_lists_url = l.contents[5]['href']
                            made_lists_url = page_url_root + made_lists_url
                            response, content = fetch.fetchio_single(
                                made_lists_url)
                            soup_lists = BeautifulSoup(content)
                            page_lists = soup_lists.findAll(
                                'div', attrs={'class': 'pagination'})
                            page_lists = page_lists[0].contents[1].contents
                            # Second-to-last pagination node links to the last page.
                            page_last = page_lists[len(page_lists) - 2]
                            page_last = page_last.contents[0]['href'].strip()
                            page_last = re.findall('/page:\d+', page_last)
                            if page_last:
                                # same char-set strip caveat as above
                                page_last = page_last[0].strip('/page:')
                                page_last = int(page_last)
                                #print page_last
                                page_first = 1
                                for page_url in range(page_first,
                                                      page_last + 1, 1):
                                    page_url = made_lists_url + "/page:" + str(
                                        page_url)
                                    print page_url
                                    response, content = fetch.fetchio_single(
                                        page_url)
                                    soup_made = BeautifulSoup(content)
                                    made_lists = soup_made.findAll(
                                        'div', attrs={'class': 'things'})
                                    if len(made_lists) == 0:
                                        #print "hello:::::::::::::::::::", made_lists
                                        continue
                                    for li in made_lists[0].contents:
                                        if type(li) == Tag:
                                            #print ".........."
                                            #print li.contents
                                            if not li.contents:
                                                #print '+++++++++++++++'
                                                break
                                            made_href = li.contents[
                                                1].contents[1]['href']
                                            #print made_href
                                            made_href = page_url_root + made_href
                                            mades.append(
                                                {pdl.made_url: made_href})
                        else:
                            # Fewer than 20 copies: entries are inline.
                            lists_made = l.contents[3].contents
                            for lm in lists_made:
                                if type(lm) == Tag:
                                    #print lm
                                    made_href = lm.contents[1]['href']
                                    mades.append({pdl.made_url: made_href})
                        #print made_href
                        #print "**********"
                except Exception, what:
                    # Best-effort scraping: log and continue.
                    print what
                    continue
            # ---- "Who's Derived It?" section (mirrors the block above) ----
            if l.contents[1].contents[0] == "Who's Derived It?":
                try:
                    if l.contents[5].contents[0]:
                        derivation_count = l.contents[5].contents[0].strip(
                            '\n\tView all variations')
                        derivation_count = int(derivation_count)
                        if derivation_count >= 20:
                            #print derivation_count
                            derivation_lists_url = l.contents[5]['href']
                            #url_root = 'http://www.thingiverse.com'
                            derivation_lists_url = page_url_root + derivation_lists_url
                            response, content = fetch.fetchio_single(
                                derivation_lists_url)
                            soup_lists = BeautifulSoup(content)
                            page_lists = soup_lists.findAll(
                                'div', attrs={'class': 'pagination'})
                            page_lists = page_lists[0].contents[1].contents
                            page_last = page_lists[len(page_lists) - 2]
                            page_last = page_last.contents[0]['href'].strip()
                            page_last = re.findall('/page:\d+', page_last)
                            if page_last:
                                page_last = page_last[0].strip('/page:')
                                page_last = int(page_last)
                                #print page_last
                                page_first = 1
                                for page_url in range(page_first,
                                                      page_last + 1, 1):
                                    page_url = derivation_lists_url + "/page:" + str(
                                        page_url)
                                    print page_url
                                    response, content = fetch.fetchio_single(
                                        page_url)
                                    if int(response['status']) != 200:
                                        print 'each derivation page is not available'
                                    soup_derived = BeautifulSoup(content)
                                    derived_lists = soup_derived.findAll(
                                        'div', attrs={'class': 'things'})
                                    if len(derived_lists) == 0:
                                        continue
                                    for li in derived_lists[0].contents:
                                        if type(li) == Tag:
                                            #print ".........."
                                            #print li
                                            if not li.contents:
                                                #print '+++++++++++++++'
                                                break
                                            derived_href = li.contents[
                                                1].contents[1]['href']
                                            #print derived_href
                                            # NOTE(review): unlike the made
                                            # branch, page_url_root is NOT
                                            # prepended here -- confirm.
                                            deriveds.append({
                                                pdl.derived_url:
                                                derived_href
                                            })
                                            #print "*****************"
                                    #sys.exit()
                                    #print "finish"
                        else:
                            lists_derived = l.contents[3].contents
                            for lm in lists_derived:
                                if type(lm) == Tag:
                                    #print lm
                                    derived_href = lm.contents[1]['href']
                                    deriveds.append(
                                        {pdl.derived_url: derived_href})
                        #print derived_href
                        #print "***=========="
                except Exception, what:
                    print what
                    continue
Ejemplo n.º 10
0
                     # NOTE(review): truncated, indentation-mangled fragment --
                     # the tail of the formatted s_made(); enclosing def/try
                     # are not visible. Code kept verbatim.
                     for lm in lists_derived:
                         if type(lm) == Tag:
                             #print lm
                             derived_href = lm.contents[1]['href']
                             deriveds.append(
                                 {pdl.derived_url: derived_href})
                 #print derived_href
                 #print "***=========="
         except Exception, what:
             print what
             continue
 if len(mades) > 0:
     # Visit each "made" page to extract its timestamp and author URL.
     mades_author_urls = []
     for m in mades:
         made_url = m[pdl.made_url]
         response, content = fetch.fetchio_single(made_url)
         soup_made_author_url = BeautifulSoup(content)
         lists = soup_made_author_url.findAll('div',
                                              attrs={'class': 'byline'})
         try:
             # NOTE(review): strip('onby') strips a character set, not the
             # words "on"/"by"; confirm it does not eat date characters.
             made_time = lists[0].contents[1].contents[2].strip('onby')
             made_time = made_time.strip()
             made_author_url = lists[0].contents[3]['href']
             mades_author_urls.append({
                 pdl.made_url:
                 made_url,
                 pdl.made_time:
                 made_time,
                 pdl.made_author_url:
                 made_author_url
             })
Ejemplo n.º 11
0
def web_reading_threads():
    """Kick off the multi-threaded crawl: fetch pages with
    ``web_read.content_scripting`` and hand results to
    ``web_processor.page_processing``.

    NOTE(review): the three numeric arguments (200, 100, 100) are passed
    positionally; their meanings (pool size / batch sizes?) should be
    confirmed against fetch.fetch_multi_once_thread_pool.
    """
    fetch.fetch_multi_once_thread_pool(200, 100, 100,
                                       web_read.content_scripting,
                                       web_processor.page_processing)