Example #1
 def get_table_data(self):
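     # Collect one row per team from the league table, print it for
     # inspection and store it in self.data before writing to table.db.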
     self.data = []
     etree = self.tree.xpath(
         '//div[@data-type="league-table-container"]/div[@data-type="table"]/div[@data-type="table-row"]'
     )
     for element in etree:
         name = sc.get_text(element.xpath('./div[@class="team"]/text()'))
         place = sc.get_text(
             element.xpath('.//span[@data-type="rank"]/text()'))
         played = sc.get_text(
             element.xpath('.//div[@data-type="played"]/text()'))
         wins = sc.get_text(
             element.xpath('.//div[@data-type="wins"]/text()'))
         draws = sc.get_text(
             element.xpath('./div[@data-type="draws"]/text()'))
         losses = sc.get_text(
             element.xpath('./div[@data-type="losses"]/text()'))
         scored = sc.get_text(
             element.xpath('./div[@data-type="goals-scored"]/text()'))
         received = sc.get_text(
             element.xpath('./div[@data-type="goals-received"]/text()'))
         difference = sc.get_text(
             element.xpath('./div[@data-type="goal-difference"]/text()'))
         points = sc.get_text(
             element.xpath('./div[@data-type="points"]/text()'))
         print(name, place, played, wins, draws, losses, scored, received,
               difference, points)
         self.data.append([
             name, place, played, wins, draws, losses, scored, received,
             difference, points
         ])
     sc.Database(('name', 'place', 'played', 'wins', 'draws', 'losses',
                  'scored', 'received', 'difference', 'points'),
                 'table.db').database(self.data)
Example #2
def collect_info(product_url):
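    # Fetch the product page and pull its name, SKU and price out of
    # the markup.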
    product_html = scraper.content(product_url)
    info = {
        'name': scraper.get_text(product_html, 'div#variant-info h1').pop(),
        'sku': scraper.get_text(product_html, 'div.commerce-product-sku span').pop(),
        'price': scraper.get_text(product_html, 'p.field-type-commerce-price').pop()
    }
    return info
Example #3
 def get_data(self):
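     # Walk the 16 paginated department pages and record each entry's
     # location, name and description before pushing them to the database.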
     for number in range(1, 17):
         self.source = requests.get(self.url + str(number) + '/')
         self.tree = lxml.html.fromstring(self.source.content)
         self.etree = self.tree.xpath(
             '//div[@id="departement-content"]/a[@class]')
         for element in self.etree:
             self.location = sc.get_text(
                 element.xpath('./span[@class="ville"]/text()'))
             self.name = sc.get_text(
                 element.xpath('./span[@class="nom"]/text()'))
             self.description = sc.get_text(
                 element.xpath('./span[@class="intitule"]/text()'))
             self.data.append([self.location, self.name, self.description])
     sc.Database(('location', 'name', 'description')).push_data(self.data)
Example #4
 def get_data(self):
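     # Page through all 246 result pages and capture each listing's job
     # link, location, posting date, company link and company name.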
     for number in range(1, 247):
         print(number)
         self.url = self.get_url(number)
         self.source = requests.get(self.url)
         self.etree = lxml.html.fromstring(self.source.content)
         self.tree = self.etree.xpath(
             '//div[@id="listingsResults"]//tr[not(@id)]')
         for element in self.tree:
             self.job = sc.get_href(
                 element.xpath('.//div[@class="listing-title"]/a[@href]'))
             self.location = sc.get_text(element.xpath(
                 './/div[@class="left-side"]/span[1]/following-sibling::span[1]/text()'))
             self.posted = sc.get_text(element.xpath(
                 './/div[@class="left-side"]/span[2]/following-sibling::span[2]/text()'))
             self.link = sc.get_href(element.xpath(
                 './/div[@class="left-side"]/span[3]/following-sibling::span[3]/a[@href]'))
             self.company = sc.get_text(element.xpath(
                 './/div[@class="left-side"]/span[3]/following-sibling::span[3]//text()'))
             self.data.append([self.job, self.location, self.posted,
                               self.link, self.company])
     sc.Database(('job', 'location', 'posted', 'link',
                  'company')).push_data(self.data)
Example #5
 def get_matches_data(self):
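     # Visit every event block in the container and record the match
     # date, kick-off time, team names and final score.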
     self.data = []
     mount = int(
         self.tree.xpath(
             'count(//div[@data-type="container"]/div[@data-type="evt"])'))
     for i in range(1, mount + 1):
         element = self.tree.xpath(
             '//div[@data-type="container"]//div[@data-type="evt"][' +
             str(i) + ']')[0]
         date = sc.get_text(
             element.xpath(
                 './/preceding::div[@class="right fs11"][1]/text()'))
         time = sc.get_text(
             element.xpath('.//div[@class="min "]/span/text()'))
         home_name = sc.get_text(
             element.xpath('.//div[@class="ply tright name"]/span/text()'))
         away_name = sc.get_text(
             element.xpath('.//div[@class="ply name"]/span/text()'))
         score_home = sc.get_text(
             element.xpath('.//span[@class="hom"]/text()'))
         score_away = sc.get_text(
             element.xpath('.//span[@class="awy"]/text()'))
         print(date, time, home_name, score_home, score_away, away_name)
         self.data.append(
             [date, time, home_name, score_home, away_name, score_away])
     sc.Database(('date', 'time', 'home_name', 'score_home', 'away_name',
                  'score_away'), 'matches.db').database(self.data)
Example #6
def write(Bing_Results, Google_Results, Queries, n):
    """Append per-query Kendall, Jaccard and subjectivity metrics for the
    Bing and Google result sets to results2.txt, followed by the averages."""
    with open('results2.txt', 'a') as results:
        results.write('Query Kendall Jaccard B_Sub B_Pol B_pos B_neg B_Class '
                      'G_Sub G_Pol G_pos G_neg G_Class\n')

        kd, jc = 0, 0
        bs, bp, bpos, bneg, bcl = 0, 0, 0, 0, 0
        gs, gp, gpos, gneg, gcl = 0, 0, 0, 0, 0

        # Missed: 13, 14, 15, 16, 17, 18, 19

        for query in Queries[42:50]:
            query = query[:-1]  # strip the trailing newline
            curr_kd = kendall.Kendall(Bing_Results[query],
                                      Google_Results[query])
            curr_jc = jaccard.Jaccard(Bing_Results[query],
                                      Google_Results[query])
            curr_bs = subjectivity.Subjectivity(
                [scraper.get_text(x) for x in Bing_Results[query]])
            curr_gs = subjectivity.Subjectivity(
                [scraper.get_text(x) for x in Google_Results[query]])
            print(query)
            kd += curr_kd
            jc += curr_jc
            bs += curr_bs[0]
            bp += curr_bs[1]
            bpos += curr_bs[2]
            bneg += curr_bs[3]
            bcl += 1 if curr_bs[4] == 'pos' else 0
            gs += curr_gs[0]
            gp += curr_gs[1]
            gpos += curr_gs[2]
            gneg += curr_gs[3]
            gcl += 1 if curr_gs[4] == 'pos' else 0
            fields = ([query, round(curr_kd, 4), round(curr_jc, 4)]
                      + [round(curr_bs[i], 4) for i in range(4)]
                      + [curr_bs[4]]
                      + [round(curr_gs[i], 4) for i in range(4)]
                      + [curr_gs[4]])
            results.write(' '.join(str(f) for f in fields) + '\n')

        results.write(
            f'Average {round(kd/n, 4)} {round(jc/n, 4)} {round(bs/n, 4)} '
            f'{round(bp/n, 4)} {round(bpos/n, 4)} {round(bneg/n, 4)} '
            f'{round(bcl/n, 4)} {round(gs/n, 4)} {round(gp/n, 4)} '
            f'{round(gpos/n, 4)} {round(gneg/n, 4)} {round(gcl/n, 4)}\n')
Example #7
 def get_data(self, words):
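     # Look up each newline-separated word and collect every translation
     # matched by the configured XPath options.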
     words = words.split('\n')
     for word in words:
         self.data[word] = []
         try:
             self.source = requests.get(self.url + word)
         except requests.exceptions.ConnectionError:
             FindWords.create_connection_error_label()
             continue  # nothing was fetched, so skip this word
         self.tree = lxml.html.fromstring(self.source.content)
         for option in self.options:
             self.etree = self.tree.xpath(option)
             for element in self.etree:
                 self.translation = sc.get_text(element.xpath('./text()'))
                 self.data[word].append(self.translation)
Example #8
def is_url_relevant(url):
    """Check if a url is relevant
    This function will check if a url is classified as relevant and 
    if it has relevent keyword.
    """

    driver = create_driver()
    driver.get(url)

    print("Checking:", url)
    urltext = scraper.get_text(driver.page_source)
    value = classifier.predict([urltext])
    keyword_relevance = isKeywordPresent(urltext)
    relevant = value and keyword_relevance
    print("Relevant:", relevant)

    driver.quit()  # release the browser before returning

    return relevant
Example #9
 def get_data(self):
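     # Read the asset's history table row by row (date, OHLC, volume,
     # market cap) and save it to a database named after the asset.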
     self.etree = self.tree.xpath(
         '//table[@class="table"]//tr[following-sibling::tr]')
     self.name = sc.get_text(
         self.tree.xpath(
             '//div[@class="row bottom-margin-1x"][1]/div[1]/h1/text()[4]'))
     for element in self.etree:
         self.date = sc.get_text(element.xpath('./td[1]/text()'))
         self.open = sc.get_text(element.xpath('./td[2]/text()'))
         self.high = sc.get_text(element.xpath('./td[3]/text()'))
         self.low = sc.get_text(element.xpath('./td[4]/text()'))
         self.close = sc.get_text(element.xpath('./td[5]/text()'))
         self.volume = sc.get_text(element.xpath('./td[6]/text()'))
         self.market_cap = sc.get_text(element.xpath('./td[7]/text()'))
         self.list = [
             self.date, self.open, self.high, self.low, self.close,
             self.volume, self.market_cap
         ]
         self.data.append(self.list)
     sc.Database(
         ('date', 'open', 'high', 'low', 'close', 'volume', 'market_cap'),
         file_name=self.name + '.db').push_data(self.data)
Example #10
def calc_key_count(url):
    """Count key terms in url
    This function will count the number of key terms for each url by analyzing its web page content.
    """

    lem = WordNetLemmatizer()
    driver = create_driver()

    driver.get(url)

    keywords = keywordFetcher.fetchKeyTerms()

    urltext = scraper.get_text(driver.page_source)
    urltext = urltext.split()
    urltext = [lem.lemmatize(word).lower() for word in urltext]

    # Sum the occurrences of every keyword in the page text.
    key_count = sum(urltext.count(keyword) for keyword in keywords)

    driver.quit()  # release the browser before returning

    print(key_count)
    return key_count