def get_professor_info(professor_id):
    url = FINAL_RATE_MY_PROFESSOR_URL + str(professor_id)
    # Optional guard: bail out early if the professor page has no rating breakdown.
    # if len(scrap_info_from_url(url, "div", "rating-breakdown")) == 0:
    #     return None
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    grades = soup.find_all("div", {"class": "grade"})
    first_name = soup.find_all("span", {"class": "pfname"})[0].text.strip()
    last_name = soup.find_all("span", {"class": "plname"})[0].text.strip()
    department = soup.find_all("div", {"class": "result-title"})[0].text.strip()
    professor_info = {
        'first_name': first_name,
        'last_name': last_name,
        'department': department.split('department')[0].split('fessor in the')[1].strip(),
        'overall_quality': grades[0].text.strip(),
        'would_take_again': grades[1].text.strip(),
        'level_of_difficulty': grades[2].text.strip(),
        'url': url,
    }
    return professor_info
def execute(self):
    result = {'scraped_datetime': str(datetime.datetime.now())}
    try:
        if self.exec_type == 'GET':
            site = requests.get(self.url, verify=False)
            for key, selector in self.selectors.__dict__.items():
                if selector is None:
                    continue
                field = key.replace('_selector', '')
                if selector.parser_type == 'bs4':
                    try:
                        page = BeautifulSoup(site.text, features="lxml")
                        node = page.select_one(selector.selector)
                        if selector.attribute is None:
                            result[field] = node.get_text()
                        else:
                            result[field] = node.get(selector.attribute)
                        if selector.processor is not None:
                            result[field] = selector.processor(node.get_text())
                    except Exception as e:
                        result[field] = str(e)
                else:
                    # lxml path: Element.findall() is the ElementPath API here.
                    try:
                        page = html.fromstring(site.text)
                        node = page.findall(selector.selector)[0]
                        result[field] = node.text
                        if selector.processor is not None:
                            result[field] = selector.processor(node.text)
                    except Exception as e:
                        result[field] = str(e)
        else:
            result = self.call()
    except Exception as e:
        print(e)
    if self.result_postprocessor is not None:
        result = self.result_postprocessor(result)
    result['region'] = self.region
    result['source_type'] = 'individual_website'
    print("{} : {}".format(self.region, result))
    return result
def Xampp():
    # Check whether the XAMPP install path already exists.
    xampp_dir = Path("/opt/lampp/")
    if not xampp_dir.exists():
        choice = input("Do you have the xampp installer on this computer (YES/NO): ")
        if choice in ['yes', 'YES', 'y', 'Y']:
            print("Please move the file to %s \n" % os.getcwd())
            input("PRESS ANY KEY WHEN THE COPY IS DONE: ")
            os.system("mv *.run xampp.run")
        else:
            url = 'https://www.apachefriends.org'
            dst = 'xampp.run'
            r = requests.get(url)
            soup = BeautifulSoup(r.text, 'html.parser')
            for x in soup.find_all('a'):
                link = x.get('href')  # extract href links from results
                if link and re.search("xampp-linux", link):  # look for the Linux installer
                    url = link  # use the installer link that was found
            print("\033[31m.............DOWNLOADING XAMPP PLEASE WAIT...............")
            urlretrieve(url, dst, MyProgressBar())
        # Instructions to execute after the decision above.
        # pwd is the sudo password, defined elsewhere in the module.
        os.system("echo %s | sudo -S chmod +x xampp.run" % pwd)
        os.system("echo %s | sudo -S ./xampp.run" % pwd)
        os.system("echo %s | sudo -S ln -s /opt/lampp/bin/php /usr/local/bin/php" % pwd)
        os.system("echo %s | sudo -S /opt/lampp/xampp start" % pwd)
    else:
        print("\n xampp install already exists. installing other requirements ..... \n \n")
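# The download above passes a MyProgressBar() instance as urlretrieve's
# reporthook, but that class is not defined in this snippet. A minimal sketch
# of what it could look like -- the reporthook signature is fixed by
# urlretrieve; the body is an illustrative assumption:
class MyProgressBar:
    def __call__(self, block_num, block_size, total_size):
        # urlretrieve calls hook(block_count, block_size, total_size)
        if total_size > 0:
            percent = min(100, block_num * block_size * 100 // total_size)
            print("\rDownloading: %d%%" % percent, end="", flush=True)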
def scrape():
    # Initialize browser
    browser = init_browser()

    # Create an empty list to store scraped dog-friendly restaurant data
    dog_scraped_data = []

    # BringFido dog-friendly restaurants
    # (https://www.bringfido.com/restaurant/city/san_francisco_ca_us/)
    dog_url = "https://www.bringfido.com/restaurant/city/san_francisco_ca_us/"
    browser.visit(dog_url)
    time.sleep(3)

    dog_html = browser.html
    dog_soup = BeautifulSoup(dog_html, 'html.parser')

    # Each result lives in a div.info block; the name is in its first span.
    for info in dog_soup.find_all("div", class_="info"):
        dog_scraped_data.append(info.find("span").text)

    browser.quit()
    return dog_scraped_data
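# scrape() relies on an init_browser() helper that is not defined in this
# snippet. A minimal sketch, assuming Splinter with a Chrome driver (the
# browser choice and the headless option are illustrative assumptions):
def init_browser():
    from splinter import Browser
    # Headless Chrome keeps the scrape from opening a visible window.
    return Browser("chrome", headless=True)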
class WikiParse(object):
    def __init__(self, **kwargs):
        self.keyword = kwargs.get('keyword')
        self.obj = wikipedia.page(self.keyword)
        self.soup = BeautifulSoup(self.obj.html(), 'html.parser')

    # WARNING: assumes every band page has an infobox and that all relevant
    # information appears under the "Background Information" subheader.
    def find_background_info(self):
        infoboxes = self.soup.find_all(attrs={'class': 'infobox'})
        background_found = False
        data = {}
        for infobox in infoboxes:
            for tr in infobox.find_all('tr'):
                th = tr.find_all('th')
                if not th:
                    continue
                th_str = th[0].string
                if not background_found:
                    if th_str and th_str == 'Background Information':
                        background_found = True
                else:
                    td = tr.find_all('td')[0]
                    td_str = td.string
                    if td_str:
                        data[th_str] = td_str
                    else:
                        alinks = td.find_all('a')
                        data[th_str] = ','.join(a.get_text() for a in alinks)
        return data

    def find_inspiration(self):
        sentences = self.obj.content.split('.')
        syn_list = SynWord.objects.get(name='inspiration').synonym
        key_sentences = []
        for sentence in sentences:
            for syn_word in syn_list:
                if syn_word.name in sentence:
                    key_sentences.append(sentence)
        return key_sentences

    def connected_nodes(self):
        sentences = self.find_inspiration()
        parents = self.find_connections(' '.join(sentences))
        print 'PARENTS: %s' % parents

    # Initial assumption: Wikipedia articles rarely mention whom the band
    # influenced, which sidesteps the harder problem of determining the
    # influencer within a sentence.
    def find_connections(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        proper_nouns = [x for x, y in tagged if y in KEY_NOUNS]
        return proper_nouns
def spider(maximum_pages):
    page = 1
    while page <= maximum_pages:
        # Append the page number to the address of the website to be crawled.
        url = 'paste your web address here' + str(page)
        # requests.get() fetches the page; the response body is HTML text.
        source_code = requests.get(url)
        plain_text = source_code.text
        # BeautifulSoup parses the HTML string into a navigable tree.
        soup = BeautifulSoup(plain_text, 'html.parser')
        # Collect every href on the page. Links use the 'a' tag; you can
        # narrow the search by passing class or other attributes.
        for link in soup.find_all('a'):
            # Join the site address with the href to form a complete link.
            href = "paste your web address here" + link.get('href')
            title = link.string  # the link's title text
            print(href)
            print(title)
        page += 1  # move on to the next page
def get_city_urls():
    # The base urls for scraping - the school list is paginated.
    with open("spot_states.txt") as f:
        base_urls = f.read().splitlines()

    school_district_urls = []

    # Create a list of individual school district urls.
    for url in base_urls:
        with open("spot_area_urls.txt") as f:
            states = f.read().splitlines()
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        main_table = soup.find_all(
            class_="table table-condensed table-striped table-hover text-left")
        # Pull every link out of the results table.
        all_links = main_table[0].find_all('a')
        for cur_link in all_links:
            cur_url = cur_link.get('href')
            for i in range(0, 50):
                print(states[i])
                length = len(states[i])
                if cur_url[0:length] == states[i] and len(cur_url) > length:
                    # url_pre and url_post are module-level constants.
                    school_district_urls.append(url_pre + cur_url + url_post)
                    break
    write_to_csv(school_district_urls, "school_district_urls2.csv")
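# write_to_csv() is not defined in this snippet. A minimal sketch matching
# the call above (one URL per row); the implementation is an assumption:
import csv

def write_to_csv(rows, filename):
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f)
        for row in rows:
            writer.writerow([row])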
def bsoup():
    r = requests.get(
        "https://answers.yahoo.com/question/index?qid=20080613085817AAqvcNW")
    soup = BeautifulSoup(r.content, "html.parser")
    string = soup.find_all("div", {"class": "group"})
    print soup.find_all(re.compile("^[A-Z]"))
    print soup.prettify()
    print string
def get_new_yorker_reports(end_year=2010):
    base_url = 'https://www.newyorker.com/magazine/reporting/page/'
    page_num = 1
    conn = sqlite3.connect('./magazine_features.db')
    cur = conn.cursor()
    done = False
    while not done:
        url = f'{base_url}{page_num}'
        r = requests.get(url)
        page = BeautifulSoup(r.text, features='html.parser')
        articles = page.find_all('li', class_='River__riverItem___3huWr')
        for article in articles:
            issue_date = article.find('div', class_='River__issueDate___2DPuc')
            issue_link = issue_date.find('a', class_='Link__link___3dWao')['href']
            # Stop completely once issues are older than end_year.
            if int(issue_link.split('/')[2]) < end_year:
                done = True
                break
            date = '-'.join(issue_link.split('/')[2:])
            byline_div = article.find('div', class_='Byline__by___37lv8')
            authors = byline_div.find_all('a')
            for author in authors:
                row = ('new_yorker', date, author.text)
                cur.execute('INSERT INTO authors VALUES (?,?,?)', row)
            conn.commit()
            print(f'Inserted for issue {date}')
        page_num += 1
    conn.close()
    return 'success!'
def main():
    # url = 'https://imgur.com/a/LpH5UiD'
    # tid = 1
    c = False
    tid = int(sys.argv[1])
    url = urllib.parse.unquote(sys.argv[2])
    r = requests.get(url, proxies=config.PROXIES, allow_redirects=True)
    if r.headers['Content-Type'].split('/')[0] == 'image':
        # ext = r.headers['Content-Type'].split('/')[1]
        create_thumbnail(r, tid)
        c = True
    else:
        soup = BeautifulSoup(r.text, 'html.parser')
        image = soup.find('meta', property='og:image')
        try:
            # Prefer the page's Open Graph image.
            iurl = image.get('content', None)
            r = requests.get(iurl, proxies=config.PROXIES, allow_redirects=True)
            if r.headers['Content-Type'].split('/')[0] == 'image':
                # ext = r.headers['Content-Type'].split('/')[1]
                create_thumbnail(r, tid)
        except Exception:
            try:
                # Fall back to the favicon.
                icon_link = soup.find("link", rel="shortcut icon")
                r = requests.get(icon_link['href'], proxies=config.PROXIES,
                                 allow_redirects=True)
                create_thumbnail(r, tid)
            except Exception:
                # Last resort: guess the largest <img> on the page.
                imgs = soup.find_all('img')
                guess = 0
                src = ''
                limit = 0
                for im in imgs:
                    try:
                        limit += 1
                        if limit > 15:
                            break
                        try:
                            height = int(im.attrs.get('height', None))
                            width = int(im.attrs.get('width', None))
                        except Exception:
                            height = 1
                            width = 1
                        isrc = im.attrs.get('src', None)
                        if (height * width) > guess:
                            src = isrc
                            guess = height * width
                    except Exception:
                        pass
                if src != '':
                    r = requests.get(src, proxies=config.PROXIES)
                    create_thumbnail(r, tid)
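# create_thumbnail() is not defined in this snippet. A minimal sketch,
# assuming Pillow and that tid names the output file -- both are
# illustrative assumptions, not the original implementation:
from io import BytesIO
from PIL import Image

def create_thumbnail(response, tid, size=(128, 128)):
    img = Image.open(BytesIO(response.content))
    img.thumbnail(size)  # resizes in place, preserving aspect ratio
    img.convert("RGB").save("thumb_{}.jpg".format(tid), "JPEG")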
def xtest_we_have_the_right_page(self):
    r = self.client.get(self.url)
    soup = BeautifulSoup(r.content, 'html.parser')
    req_text = "Add Fire Behaviour Calculation document"
    h1_tags = soup.find_all('h1')
    print(h1_tags)
    assert any(tag.string == req_text for tag in h1_tags)
def print_to_text(base_url):
    import requests
    from bs4 import BeautifulSoup
    r = requests.get(base_url)
    soup = BeautifulSoup(r.text, features="html.parser")
    with open("less.txt", "w") as f:
        for paragraph in soup.find_all(dir='ltr'):
            # .text already strips tags, so no further cleanup is needed.
            f.write(paragraph.text)
def AnalizeReserveList():
    topurl = "https://www.lib.city.kobe.jp/opac/opacs/reservation_cancel_confirmation?reservation_order_confirmation=%e9%a0%86%e4%bd%8d%e7%a2%ba%e8%aa%8d"
    html = open("book.htm", "r").read()
    sp = BeautifulSoup(html, "html.parser")
    table = sp.find_all("table")
    rows = table[0].find_all("tr")
    for row in rows:
        cell = row.find_all("td")[2]  # the reservation number is in the third column
        print(cell)
def parse(self, response):
    # Use the xml parser so RSS tags such as pubDate keep their case.
    rss = BeautifulSoup(response.body, "xml")
    for item in rss.find_all("item"):
        feed_item = ChinanewsCrawlItem()  # build an item object
        feed_item['title'] = item.title.text  # the fields below come from the page source
        feed_item['link'] = item.link.text
        feed_item['desc'] = item.description.text
        feed_item['pub_date'] = item.pubDate.text
        yield feed_item  # yield each item to the pipeline
def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for item_name in soup.find_all('div', {'class': 'notranslate'}):
        print(item_name.string)
    for link in soup.find_all('a'):
        href = "https://www.ebay.com" + link.get('href')
        print(href)
def get_single_item_date(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for item_name in soup.find_all('div', {'class': 'i-name'}):
        print(item_name.string)
    for link in soup.find_all('a'):
        href = "http://www.imdb.com" + link.get('href')
        print(href)
def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            # extend, not append: re.findall returns a list of matched codes
            lst.extend(re.findall(r'[s][hz]\d{6}', href))
        except:
            continue
def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = ('http://www.imdb.com/chart/toptv/?ref_=nv_tvv_250_4/page+'
               + str(page))
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.find_all('a', {'class': 'item.name'}):
            href = 'http://www.imdb.com' + link.get('href')
            title = link.string
            print(href)
            print(title)
            get_single_item_date(href)
        page += 1
def getStockList(lst, stockURL):
    html = getHTMLText(stockURL, 'GB2312')
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    # Not every <a> tag matches the regular expression, and parsing can raise
    # all kinds of errors along the way; if an exception occurs the tag is
    # outside what we want, so simply continue with the next one.
    for i in a:
        try:
            href = i.attrs['href']
            lst.extend(re.findall(r'[s][hz]\d{6}', href))
        except:
            continue
    return ""
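# Both getStockList variants above depend on a getHTMLText helper that is
# not defined here. A minimal sketch matching the two call sites (a URL plus
# an optional page encoding); the implementation is an assumption:
def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code  # decode the page with the caller-supplied encoding
        return r.text
    except Exception:
        return ""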
def print_zillow_price(address, city, state):
    address = address.replace(', Cary', '')
    query_url = 'http://www.zillow.com/webservice/GetSearchResults.htm?zws-id={0}&address={1}&citystatezip={2}'
    query_url = query_url.format(ZILLOW_KEY, address, city + ',' + state)
    r = urllib.request.urlopen(query_url)
    doc = BeautifulSoup(r, 'xml')
    for item in doc.find_all('message'):
        print(item)
def scrape(self):
    r = urllib.request.urlopen(self.site)
    html = r.read()
    parser = 'html.parser'
    sp = BeautifulSoup(html, parser)
    for tag in sp.find_all('a'):
        url = tag.get('href')
        if url is None:
            continue
        if 'html' in url:
            print('\n' + url)
def TLD_specific_search(document, is_raw_content):
    TLD = top_level_domain_pattern(document, is_raw_content)
    raw_content = document["raw_content"]
    # raw_content = document
    if TLD and raw_content:
        soup = BeautifulSoup(raw_content, 'html.parser')
        content = ""
        if TLD == "escortcafe.com":
            content = soup.find_all("div", class_="details")
            # print type(content)
            # print re.findall('Blonde', content)
        elif TLD == "classifriedads.com":
            content = soup.find_all(id="contentcell")
        elif TLD == "slixa.com":
            content = (soup.find_all("div", class_="span9 profile-content")
                       + soup.find_all("aside", class_="profile-sidebar span3"))
        # elif TLD == "allsexyescort.com":
        elif TLD == "escort-ads.com":
            content = soup.find_all("div", class_="container main-content vip-content")
        # elif TLD == "liveescortreviews.com":
        # elif TLD == "escort-europe.com":
        elif TLD == "find-escorts.com":
            content = soup.find_all(id="contentcell")
        elif TLD == "escortserv.com":
            content = soup.find_all(id="index")
        elif TLD == "slixa.ca":
            content = (soup.find_all("div", class_="span9 profile-content")
                       + soup.find_all("aside", class_="profile-sidebar span3"))
        elif TLD == "escortpost.com":
            content = soup.find_all(id="content")
        elif TLD == "privateescorts.ro":
            content = soup.find_all("tbody")
        elif TLD == "adultsearch.com":
            content = soup.find_all(id="ad")
        return str(content)
    else:
        return ""
def TLD_specific_search(document):
    TLD = extraction.top_level_domain_pattern(document)
    raw_content = extraction.get_raw_content(document)
    if TLD and raw_content:
        soup = BeautifulSoup(raw_content, 'html.parser')
        content = ""
        if TLD == "escortcafe.com":
            content = soup.find_all("div", class_="details")
        elif TLD == "classifriedads.com":
            content = soup.find_all(id="contentcell")
        elif TLD == "slixa.com":
            content = (soup.find_all("div", class_="span9 profile-content")
                       + soup.find_all("aside", class_="profile-sidebar span3"))
        # elif TLD == "allsexyescort.com":
        elif TLD == "escort-ads.com":
            content = soup.find_all("div", class_="container main-content vip-content")
        # elif TLD == "liveescortreviews.com":
        # elif TLD == "escort-europe.com":
        elif TLD == "find-escorts.com":
            content = soup.find_all(id="contentcell")
        elif TLD == "escortserv.com":
            content = soup.find_all(id="index")
        elif TLD == "slixa.ca":
            content = (soup.find_all("div", class_="span9 profile-content")
                       + soup.find_all("aside", class_="profile-sidebar span3"))
        elif TLD == "escortpost.com":
            content = soup.find_all(id="content")
        elif TLD == "privateescorts.ro":
            content = soup.find_all("tbody")
        elif TLD == "adultsearch.com":
            content = soup.find_all(id="ad")
        return str(content)
    else:
        return ""
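# Both TLD_specific_search variants rely on a top_level_domain_pattern helper
# defined elsewhere (in an extraction module in the second variant). A rough
# sketch of the idea, assuming the document carries its source URL -- the
# field name and body are illustrative assumptions:
from urllib.parse import urlparse

def top_level_domain_pattern(document, is_raw_content=False):
    netloc = urlparse(document.get("url", "")).netloc
    # Strip a leading "www." so the value matches keys like "escortcafe.com".
    return netloc[4:] if netloc.startswith("www.") else netloc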
def get_recipe_info(recipe_link):
    recipe_dict = dict()
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(recipe_link)
        if not response.status_code == 200:
            return recipe_dict
        results_page = BeautifulSoup(response.content, 'lxml')
        ingredient_list = list()
        prep_steps = list()
        for ingredients in results_page.find_all('li', class_='ingredient'):
            ingredient_list.append(ingredients.get_text())
        for steps in results_page.find_all('li', class_='preparation-step'):
            prep_steps.append(steps.get_text().strip())
        recipe_dict['ingredients'] = ingredient_list
        recipe_dict['preparation'] = prep_steps
        return recipe_dict
    except:
        return recipe_dict
def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://buckysroom.org/trade/search.php?page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.find_all('a', {'class': 'item-name'}):
            href = 'https://buckysroom.org' + link.get('href')
            title = link.string
            print(href)
            print(title)
        page += 1
def main():
    c = False
    tid = int(sys.argv[1])
    url = urllib.parse.unquote(sys.argv[2])
    r = requests.get(url, proxies=config.PROXIES, allow_redirects=True)
    if r.headers['Content-Type'].split('/')[0] == 'image':
        create_thumbnail(r, tid)
        add_remote_image(url, tid)
        c = True
    else:
        soup = BeautifulSoup(r.text, 'html.parser')
        image = soup.find('meta', property='og:image')
        try:
            # Prefer the page's Open Graph image.
            iurl = image.get('content', None)
            r = requests.get(iurl, proxies=config.PROXIES, allow_redirects=True)
            if r.headers['Content-Type'].split('/')[0] == 'image':
                create_thumbnail(r, tid)
                add_remote_image(iurl, tid)
        except Exception:
            # Fall back to guessing the largest <img> on the page.
            imgs = soup.find_all('img')
            guess = 0
            src = ''
            limit = 0
            for im in imgs:
                try:
                    limit += 1
                    if limit > 15:
                        break
                    try:
                        height = int(im.attrs.get('height', None))
                        width = int(im.attrs.get('width', None))
                    except Exception:
                        height = 1
                        width = 1
                    isrc = im.attrs.get('src', None)
                    if (height * width) > guess:
                        src = isrc
                        guess = height * width
                except Exception:
                    pass
            if src != '':
                r = requests.get(src, proxies=config.PROXIES)
                create_thumbnail(r, tid)
def trade_spider(max_pages):
    page = 1
    while page < max_pages:
        url = 'https....' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.find_all('a', {'class': 'item-name'}):
            href = "https...." + link.get('href')
            title = link.string
            # print(href)
            # print(title)
            get_single_item_data(href)
        page += 1
def getpic(search):
    try:
        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [('User-agent', 'Mozilla')]
        htmltext = browser.open("http://www.google.com/?t=lm")
        img_urls = []
        soup = BeautifulSoup(htmltext, 'html.parser')
        results = soup.find_all("a")
        print results
    except:
        print "error"
def mirrorImages(url, dir):
    ab = anonBrowser()
    ab.anonymize()
    html = ab.open(url)
    soup = BeautifulSoup(html)
    image_tags = soup.find_all('img')
    for image in image_tags:
        # lstrip('https://') strips characters, not a prefix; remove the
        # scheme explicitly instead.
        filename = re.sub(r'^https?://', '', image['src'])
        filename = os.path.join(dir, filename.replace('/', '_'))
        print('[+] Saving ' + str(filename))
        data = ab.open(image['src']).read()
        ab.back()
        save = open(filename, 'wb')
        save.write(data)
        save.close()
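# anonBrowser is an external helper, not defined in this snippet. A rough
# sketch of the interface used above -- a mechanize.Browser subclass whose
# anonymize() clears cookies and rotates the User-Agent; the body here is an
# illustrative assumption, not the original class:
import random
import mechanize

class anonBrowser(mechanize.Browser):
    def __init__(self, user_agents=None):
        mechanize.Browser.__init__(self)
        self.set_handle_robots(False)
        self.user_agents = user_agents or [('User-agent', 'Mozilla/5.0')]
        self.cookie_jar = mechanize.LWPCookieJar()
        self.set_cookiejar(self.cookie_jar)

    def anonymize(self):
        self.cookie_jar.clear()  # drop identifying cookies
        self.addheaders = [random.choice(self.user_agents)]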
def summarize_best_books(filepath):
    """
    Write a function to get a list of categories, book title and URLs from the
    "BEST BOOKS OF 2020" page in "best_books_2020.htm". This function should
    create a BeautifulSoup object from a filepath and return a list of
    (category, book title, URL) tuples.

    For example, if the best book in category "Fiction" is "The Testaments
    (The Handmaid's Tale, #2)", with URL
    https://www.goodreads.com/choiceawards/best-fiction-books-2020, then you
    should append ("Fiction", "The Testaments (The Handmaid's Tale, #2)",
    "https://www.goodreads.com/choiceawards/best-fiction-books-2020") to your
    list of tuples.
    """
    with open(filepath) as fp:
        soup = BeautifulSoup(fp, "html5lib")
    categories = soup.find_all('h4', class_="category__copy")
    titles = soup.find_all('a', class_="readable")
    urls = soup.find_all('a', class_="readable")

    l_titles = [t.text for t in titles]
    l_categories = [c.text for c in categories]
    # The URL lives in the anchor's href attribute, not its text.
    l_urls = [u['href'] for u in urls]

    l_tups = []
    for i in range(len(l_titles)):
        l_tups.append((l_categories[i], l_titles[i], l_urls[i]))
    return l_tups
def parse_page(url_page, path, headers):
    # Download the 48 images on one page.
    global n
    req = getHtml(url_page, headers)  # fetch the HTML source
    html = req.text
    bf = BeautifulSoup(html, 'lxml')  # parse the HTML into a BeautifulSoup object
    page_url = bf.find_all('div', class_='page')
    # Extract the detail-page links for all 48 images on this page.
    targets_url = bf.find_all('div', class_='photo_card__grid')
    targets_url1 = targets_url[0].find_all('a')  # collect all the links into one list
    for each in targets_url1:
        # Extract and download each image.
        url_photo = each.get('href')
        parses_picturePage(url_photo, path)
        n = n + 1
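# getHtml() is not defined in this snippet. A minimal sketch matching the
# call above (URL plus request headers), assuming a plain requests fetch;
# the implementation is an assumption:
def getHtml(url, headers):
    req = requests.get(url, headers=headers, timeout=30)
    req.raise_for_status()
    return req  # callers read .text from the response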
def build_interface(prime_prod):
    spec_table = []
    # Look at the first ten products (or fewer, if fewer were found).
    for i in range(min(10, len(prime_prod))):
        # Note: the parser name belongs to BeautifulSoup, not requests.get.
        spec_page = requests.get(prime_prod[i].product_href).text
        spec_soup = BeautifulSoup(spec_page, "html5lib")
        st = spec_soup.find_all("div", {'class': 'pad-top-sm'})
        for item in st:
            spec_table.append(item.text)
    return spec_table
def crawl(url):
    headers = {
        'User-Agent': 'Mozilla/5.0(Window;U;Windows NT 6.1;en-US;rv:1.9.1.6) Gecko/20091201 Frefox/3.5.6'
    }
    req = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(req, timeout=20)
    contents = page.read()
    soup = BeautifulSoup(contents, 'html.parser')
    my_girl = soup.find_all('img')
    for girl in my_girl:
        link = girl.get('src')
        print(link)
        contents = urllib.request.urlopen(link).read()
        with open(u'D:/meizi' + '/' + link[-11:], 'wb') as code:
            code.write(contents)
def pull_oddshark_baseball(url, naming_standard):
    '''
    Gets the betting offers listed at an oddshark webpage.
    This function should be used instead of pull_oddshark for baseball matches.
    Returns a dictionary of events.
    '''
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']
    # Use this for converting string months into the conventional integer for
    # that month, e.g. February = 2.
    month_str_to_number = {months[i]: i + 1 for i in range(len(months))}

    '''
    HTML Tags

    Team 1 win odds
        class = op-item op-spread border-bottom op-<BOOKIE_NAME>
    Team 2 win odds
        class = op-item op-spread op-<BOOKIE_NAME>
    Date
        class = op-separator-bar op-left no-group-name
        data-op-date = {"full_date":"Tuesday August 21","short_date":"Tue Aug 21","group_name":""}
    Team 1
        class = op-matchup-team op-matchup-text op-team-top
        data-op-name = {"full_name":"Atlanta","short_name":"ATL"}
    Team 2
        class = op-matchup-team op-matchup-text op-team-bottom
        data-op-name = {"full_name":"Pittsburgh","short_name":"PIT"}
    '''
    bookie_names = ['opening', 'bovada.lv', 'mybookie', 'intertops',
                    'betonline', 'caesars', '5dimes', 'westgate', 'topbet',
                    'sportsbetting', 'gtbets', 'betnow', 'skybook', 'sportbet',
                    'station', 'mirage', 'wynn']
    team_1_win_class = 'op-item op-spread border-bottom op-{}'
    team_2_win_class = 'op-item op-spread op-{}'

    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    datetime_valid = datetime.datetime.now(datetime.timezone.utc)
    team_1_tags = soup.find_all("div", class_="op-matchup-team op-matchup-text op-team-top")
    for team_1_tag in team_1_tags:
        pass  # parsing of each matchup is not implemented yet
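# For reference, the data-op-name attribute documented above holds JSON, so
# each matchup's team name could be recovered like this. This is a sketch of
# one step the unfinished loop might take, not the original code:
import json

def team_name(tag):
    # e.g. {"full_name":"Atlanta","short_name":"ATL"} -> "Atlanta"
    return json.loads(tag['data-op-name'])['full_name']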
def fileToArticle(aid):
    html = open(tempArticleDir + "/" + str(aid)[0:2] + "/" + str(aid)).read()
    soup = BeautifulSoup(html, from_encoding="utf8")
    # aid
    atype = None
    if html.find(u"btn_Addpost") > -1:
        atype = 1
    elif html.find(u"op_btn") > -1:
        atype = 2
    title = unicode(soup.find("div", "post_title").h1.string)
    author_div = soup.find("div", "fl").a
    author = unicode(author_div.string)
    author_link = u"http://www.mafengwo.cn" + unicode(author_div["href"])
    wdate = datetime.strptime(soup.find("span", "date").string, "%Y-%m-%d %H:%M:%S")
    # replyCount
    r_div = soup.find_all("div", "post_item")
    replyCount = len(r_div) - 1
    reply_div = soup.find("div", "turn_page").find("div", "paginator")
    if reply_div:
        a = reply_div.find_all("a")
        reply_page = int(a[-2].string)
        replyCount = replyCount + (reply_page - 1) * 50
    location = None
    dl_location = soup.find("dl", "related_mdd")
    if dl_location:
        location = unicode(dl_location.p.a.string)
    imgs, txt = getImgTexts(html)
    imgCount = len(imgs)
    txtCount = 0
    for t in txt:
        txtCount = txtCount + len(t)
import time
import urllib.request

from bs4 import BeautifulSoup

from dbconnect import connection

req = urllib.request.urlopen('http://www.nationaljournal.com/politics?rss=1')
xml = BeautifulSoup(req, 'xml')

c, conn = connection()

for item in xml.find_all('link')[3:]:
    url = item.text
    c.execute("INSERT INTO links (time, link) VALUES (%s, %s)",
              (time.time(), url))
    conn.commit()

conn.close()
def test_detail_response_html(dbtransaction, authenticated_app, new_entry):
    new_entry_id = new_entry.id
    response = authenticated_app.get('/entries/{}'.format(new_entry_id))
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.find_all('a')
    # The detail page should link to this entry's edit view.
    edit_href = '/entries/{}/edit'.format(new_entry_id)
    assert any(a.get('href') == edit_href for a in anchors)
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin

URL = 'http://philadelphia.craigslist.org/search/sss?sort=date&query=firefly%20tickets'
BASE = 'http://philadelphia.craigslist.org/cpg/'

response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')

for listing in soup.find_all('p', {'class': 'row'}):
    if listing.find('span', {'class': 'price'}) != None:
        price = int(listing.text[2:6])
        if price <= 250 and price > 100:
            print listing.text
            link_end = listing.a['href']
            url = urljoin(BASE, link_end)
            print url
            print "\n"
source = 'http://mp.weixin.qq.com/s?__biz=MjM5MDkyOTI1OQ==&mid=209217907&idx=6&sn=9d975e5aca7a10bc70bec1e8b3dec3db&scene=5#rd'

# import requests
from urllib2 import urlopen

urlcontent = urlopen(source).read()
print urlcontent

from bs4 import BeautifulSoup

soup = BeautifulSoup(urlcontent)
# print dir(soup)
for p in soup.find_all('p'):
    print p
from bs4 import BeautifulSoup
import urllib.request

urls = open("testurl.txt", "r")

# For each page: fetch the URL, then parse its HTML.
for page in urls:
    html = urllib.request.urlopen(page.strip())
    soup = BeautifulSoup(html, "html.parser")
    woah = soup.find("div", class_="biography").children
    print(list(woah))
import urllib2
import datetime
import re
import MySQLdb
import csv
from bs4 import BeautifulSoup as Soup

today = datetime.date.today()
html = urllib2.urlopen("http://www.99acres.com/property-in-velachery-chennai-south-ffid").read()
soup = Soup(html)

print "INSERT INTO Property (URL,Rooms, Place, Phonenumber1,Phonenumber2,Phonenumber3,Typeofperson, Name)"
print "VALUES ("

f = open('out.txt', 'w')
re_digit = re.compile('(\d+)')
pdate = soup.find_all('i', {'class': 'pdate'})
properties = soup.find_all('a', title=re.compile('Bedroom'))

for eachproperty in properties:
    for eachdate in pdate:
        pdates = re.sub('(\s{2,})', ' ', eachdate.text)
        for div in soup.find_all('div', {'class': 'sT_disc grey'}):
            try:
                project = div.find('span').find('b').text.strip()
            except:
                project = 'No project'
            area = re.findall(re_digit, div.find('i', {'class': 'blk'}).text.strip())
        print today, "," + "http:/" + eachproperty['href'] + ",", eachproperty.string + "," + ",".join(re.findall("'([a-zA-Z0-9,\s]*)'", eachproperty['onclick'])) + "," + ", ".join([project] + area), "," + pdates

print "),"
import webbrowser
from bs4 import BeautifulSoup
# from lxml import html

f = open('amazon_results.txt', 'r')
x = f.read()
soup = BeautifulSoup(x)
LMT = 10
cnt = 0
val = 0
arr = []
for val in range(1, LMT + 1):
    v = 'result_' + str(val)
    print v
    for x in soup.find_all('div', str(v)):
        for a in x.find_all('a'):
            f = 1
            for b in a.find_all('span', 'lrg bold'):
                c = b.get_text()
                if c:
                    p1 = "Product Name:" + c
                    print p1
            for b in a.find_all('span', 'med reg'):
                c = b.get_text()
                if c:
                    p2 = c
                    print c
            for b in a.find_all('span', 'price bld'):
                c = b.get_text()
                if c:
                    print c
def parseTimeline(html, username):
    global peopleIDList, likesCountList, reportFileName
    soup = BeautifulSoup(html)
    tlTime = soup.findAll("abbr")
    temp123 = soup.findAll("div", {"role": "article"})
    placesCheckin = []
    timeOfPostList = []
    counter = 0
    for y in temp123:
        soup1 = BeautifulSoup(str(y))
        tlDateTimeLoc = soup1.findAll("a", {"class": "uiLinkSubtle"})
        # Universal time
        try:
            soup2 = BeautifulSoup(str(tlDateTimeLoc[0]))
            tlDateTime = soup2.find("abbr")
            # Facebook post link
            tlLink = tlDateTimeLoc[0]['href']
            try:
                tz = get_localzone()
                unixTime = str(tlDateTime['data-utime'])
                localTime = (datetime.datetime.fromtimestamp(int(unixTime))
                             .strftime('%Y-%m-%d %H:%M:%S'))
                timeOfPost = localTime
                timeOfPostList.append(localTime)
                print "[*] Time of Post: " + localTime
            except TypeError:
                continue
            if "posts" in tlLink:
                # print tlLink.strip()
                pageID = tlLink.split("/")
                parsePost(pageID[3], username)
                peopleIDLikes = parseLikesPosts(pageID[3])
                try:
                    for id1 in peopleIDLikes:
                        if id1 in peopleIDList:
                            position = peopleIDList.index(id1)
                            likesCountList[position] += 1
                        else:
                            peopleIDList.append(id1)
                            likesCountList.append(1)
                except TypeError:
                    continue
            if len(tlDateTimeLoc) > 2:
                try:
                    # Device / location
                    if len(tlDateTimeLoc[1].text) > 0:
                        print "[*] Location of Post: " + unicode(tlDateTimeLoc[1].text)
                    if len(tlDateTimeLoc[2].text) > 0:
                        print "[*] Device: " + str(tlDateTimeLoc[2].text)
                except IndexError:
                    continue
            else:
                try:
                    # Device / location
                    if len(tlDateTimeLoc[1].text) > 0:
                        if "mobile" in tlDateTimeLoc[1].text:
                            print "[*] Device: " + str(tlDateTimeLoc[1].text)
                        else:
                            print "[*] Location of Post: " + unicode(tlDateTimeLoc[1].text)
                except IndexError:
                    continue
            # Facebook posts
            tlPosts = soup1.find("span", {"class": "userContent"})
            try:
                tlPostSec = soup1.find_all("span", {"class": "userContentSecondary fcg"})
                tlPostMsg = ""
                # Places checked in
            except TypeError:
                continue
            soup3 = BeautifulSoup(str(tlPostSec))
            hrefLink = soup3.find("a")
            """
            if len(str(tlPostSec)) > 0:
                tlPostMsg = str(tlPostSec)
                # if " at " in str(tlPostMsg) and " with " not in str(tlPostMsg):
                if " at " in str(tlPostMsg):
                    print str(tlPostSec)
                    print tlPostMsg
                    # print hrefLink
                    # placeUrl = hrefLink['href'].encode('utf8').split('?')[0]
                    # print "[*] Place: " + placeUrl
                    # placesCheckin.append([timeOfPost, placeUrl])
            """
            try:
                if len(tlPosts) > 0:
                    tlPostStr = re.sub('<[^>]*>', '', str(tlPosts))
                    if tlPostStr != None:
                        print "[*] Message: " + str(tlPostStr)
            except TypeError:
                continue
            tlPosts = soup1.find("div", {"class": "translationEligibleUserMessage userContent"})
            try:
                if len(tlPosts) > 0:
                    tlPostStr = re.sub('<[^>]*>', '', str(tlPosts))
                    print "[*] Message: " + str(tlPostStr)
            except TypeError:
                continue
        except IndexError:
            continue
        counter += 1

    tlDeviceLoc = soup.findAll("a", {"class": "uiLinkSubtle"})
    print '\n'
    if len(reportFileName) < 1:
        reportFileName = username + "_report.txt"
    reportFile = open(reportFileName, "w")

    reportFile.write("\n********** Places Visited By " + str(username) + " **********\n")
    filename = username + '_placesVisited.htm'
    if not os.path.lexists(filename):
        html = downloadPlacesVisited(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    dataList = parsePlacesVisited(html)
    count = 1
    for i in dataList:
        reportFile.write(normalize(i[2]) + '\t' + normalize(i[1]) + '\t' + normalize(i[3]) + '\n')
        count += 1

    reportFile.write("\n********** Places Liked By " + str(username) + " **********\n")
    filename = username + '_placesLiked.htm'
    if not os.path.lexists(filename):
        html = downloadPlacesLiked(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    dataList = parsePlacesLiked(html)
    count = 1
    for i in dataList:
        reportFile.write(normalize(i[2]) + '\t' + normalize(i[1]) + '\t' + normalize(i[3]) + '\n')
        count += 1

    reportFile.write("\n********** Places checked in **********\n")
    for places in placesVisitedList:
        unixTime = places[0]
        localTime = (datetime.datetime.fromtimestamp(int(unixTime))
                     .strftime('%Y-%m-%d %H:%M:%S'))
        reportFile.write(localTime + '\t' + normalize(places[1]) + '\t' + normalize(places[2]) + '\n')

    reportFile.write("\n********** Apps used By " + str(username) + " **********\n")
    filename = username + '_apps.htm'
    if not os.path.lexists(filename):
        html = downloadAppsUsed(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    data1 = parseAppsUsed(html)
    result = ""
    for x in data1:
        reportFile.write(normalize(x) + '\n')
        x = x.lower()
        if "blackberry" in x:
            result += "[*] User is using a Blackberry device\n"
        if "android" in x:
            result += "[*] User is using an Android device\n"
        if "ios" in x or "ipad" in x or "iphone" in x:
            result += "[*] User is using an iOS Apple device\n"
        if "samsung" in x:
            result += "[*] User is using a Samsung Android device\n"
    reportFile.write(result)

    reportFile.write("\n********** Videos Posted By " + str(username) + " **********\n")
    filename = username + '_videosBy.htm'
    if not os.path.lexists(filename):
        html = downloadVideosBy(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    dataList = parseVideosBy(html)
    count = 1
    for i in dataList:
        reportFile.write(normalize(i[2]) + '\t' + normalize(i[1]) + '\n')
        count += 1

    reportFile.write("\n********** Pages Liked By " + str(username) + " **********\n")
    filename = username + '_pages.htm'
    if not os.path.lexists(filename):
        print "[*] Caching Pages Liked: " + username
        html = downloadPagesLiked(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    dataList = parsePagesLiked(html)
    for i in dataList:
        pageName = normalize(i[0])
        tmpStr = normalize(i[3]) + '\t' + normalize(i[2]) + '\t' + normalize(i[1]) + '\n'
        reportFile.write(tmpStr)
    print "\n"

    c = conn.cursor()
    reportFile.write("\n********** Friendship History of " + str(username) + " **********\n")
    c.execute('select * from friends where sourceUID=?', (uid,))
    dataList = c.fetchall()
    try:
        if len(str(dataList[0][4])) > 0:
            for i in dataList:
                # Date first, followed by username
                reportFile.write(normalize(i[4]) + '\t' + normalize(i[3]) + '\t' + normalize(i[2]) + '\t' + normalize(i[1]) + '\n')
            print '\n'
    except IndexError:
        pass

    reportFile.write("\n********** Friends of " + str(username) + " **********\n")
    reportFile.write("*** Backtracing from Facebook Likes/Comments/Tags ***\n\n")
    c = conn.cursor()
    c.execute('select userName from friends where sourceUID=?', (uid,))
    dataList = c.fetchall()
    for i in dataList:
        reportFile.write(str(i[0]) + '\n')
    print '\n'

    tempList = []
    totalLen = len(timeOfPostList)
    timeSlot1 = timeSlot2 = timeSlot3 = timeSlot4 = 0
    timeSlot5 = timeSlot6 = timeSlot7 = timeSlot8 = 0
    count = 0
    if len(peopleIDList) > 0:
        likesCountList, peopleIDList = zip(*sorted(zip(likesCountList, peopleIDList), reverse=True))
        reportFile.write("\n********** Analysis of Facebook Post Likes **********\n")
        while count < len(peopleIDList):
            testStr = str(likesCountList[count]).encode('utf8') + '\t' + str(peopleIDList[count]).encode('utf8')
            reportFile.write(testStr + "\n")
            count += 1

    reportFile.write("\n********** Analysis of Interactions between " + str(username) + " and Friends **********\n")
    c = conn.cursor()
    c.execute('select userName from friends where sourceUID=?', (uid,))
    dataList = c.fetchall()
    photosLikedUser = []
    photosLikedCount = []
    photosCommentedUser = []
    photosCommentedCount = []
    for i in dataList:
        c.execute('select * from photosLiked where sourceUID=? and username=?', (uid, i[0],))
        dataList1 = c.fetchall()
        if len(dataList1) > 0:
            photosLikedUser.append(normalize(i[0]))
            photosLikedCount.append(len(dataList1))
    for i in dataList:
        c.execute('select * from photosCommented where sourceUID=? and username=?', (uid, i[0],))
        dataList1 = c.fetchall()
        if len(dataList1) > 0:
            photosCommentedUser.append(normalize(i[0]))
            photosCommentedCount.append(len(dataList1))
    if len(photosLikedCount) > 1:
        reportFile.write("Photo Likes: " + str(username) + " and Friends\n")
        photosLikedCount, photosLikedUser = zip(*sorted(zip(photosLikedCount, photosLikedUser), reverse=True))
        count = 0
        while count < len(photosLikedCount):
            tmpStr = str(photosLikedCount[count]) + '\t' + normalize(photosLikedUser[count]) + '\n'
            count += 1
            reportFile.write(tmpStr)
    if len(photosCommentedCount) > 1:
        reportFile.write("\n********** Comments on " + str(username) + "'s Photos **********\n")
        photosCommentedCount, photosCommentedUser = zip(*sorted(zip(photosCommentedCount, photosCommentedUser), reverse=True))
        count = 0
        while count < len(photosCommentedCount):
            tmpStr = str(photosCommentedCount[count]) + '\t' + normalize(photosCommentedUser[count]) + '\n'
            count += 1
            reportFile.write(tmpStr)

    reportFile.write("\n********** Analysis of Time in Facebook **********\n")
    for timePost in timeOfPostList:
        tempList.append(timePost.split(" ")[1])
        tempTime = int((timePost.split(" ")[1]).split(":")[0])
        if tempTime >= 21:
            timeSlot8 += 1
        if tempTime >= 18 and tempTime < 21:
            timeSlot7 += 1
        if tempTime >= 15 and tempTime < 18:
            timeSlot6 += 1
        if tempTime >= 12 and tempTime < 15:
            timeSlot5 += 1
        if tempTime >= 9 and tempTime < 12:
            timeSlot4 += 1
        if tempTime >= 6 and tempTime < 9:
            timeSlot3 += 1
        if tempTime >= 3 and tempTime < 6:
            timeSlot2 += 1
        if tempTime >= 0 and tempTime < 3:
            timeSlot1 += 1
    # float() avoids Python 2 integer division truncating every percentage to 0.
    totalLen = float(totalLen)
    reportFile.write("Total % (00:00 to 03:00) " + str((timeSlot1 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (03:00 to 06:00) " + str((timeSlot2 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (06:00 to 09:00) " + str((timeSlot3 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (09:00 to 12:00) " + str((timeSlot4 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (12:00 to 15:00) " + str((timeSlot5 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (15:00 to 18:00) " + str((timeSlot6 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (18:00 to 21:00) " + str((timeSlot7 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (21:00 to 24:00) " + str((timeSlot8 / totalLen) * 100) + " %\n")
    """
    reportFile.write("\nDate/Time of Facebook Posts\n")
    for timePost in timeOfPostList:
        reportFile.write(timePost + '\n')
    """
    reportFile.close()
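# normalize() is used throughout the report-writing code above but defined
# elsewhere. A plausible sketch for this Python 2 context -- coercing tags,
# ints, and unicode to utf-8 byte strings safe for file writes (an
# assumption, not the original helper):
def normalize(value):
    return unicode(value).encode('utf8')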