from pyquery import PyQuery

def calculate_concat(question_text, answer):
    # define_url, search and get_google_total_results are helpers defined
    # elsewhere in this project
    query_url = define_url(question_text, answer.get_text())
    google_results, full_page = search(query_url, full_page=True)
    # Extract the number of total google results
    answer.total_results = get_google_total_results(full_page)

    for result in google_results:
        result_text = PyQuery(result).text()
        # When Google can't find enough matches, it pads the page with loosely
        # related results and flags them with a "Missing: <keywords>" note
        # ("Mancanti:" in Google's Italian UI), listing query words that the
        # result doesn't actually contain. Those results are excluded here:
        # a result only counts if the answer appears in its text, the
        # "Mancanti:" flag is absent, and the answer is not merely in the
        # trailing "Must include" line.
        if answer.get_text() in result_text.lower() and \
                "Mancanti:" not in result_text and \
                answer.get_text() not in result_text.split("\n")[-1].lower():
            # Yay! This is a relevant result!
            answer.results += 1

    # Calculate the score of the answer
    answer.score = answer.total_results * (answer.results
                                           if answer.results > 0 else 1)

    return f"{answer.get_text()[:40]:^40}{answer.score:<10}{answer.results:^10}{answer.total_results:<10}"
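
A minimal usage sketch: the Answer class here is a stand-in that stubs only the attributes calculate_concat touches, and define_url, search and get_google_total_results still have to come from the surrounding project.

class Answer:
    def __init__(self, text):
        self._text = text
        self.results = 0
        self.total_results = 0
        self.score = 0

    def get_text(self):
        return self._text

print(calculate_concat("Which river flows through Rome?", Answer("tiber")))
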
Example #2
import re
from pyquery import PyQuery

# module-level patterns the original snippet refers to but does not define
non_trimmed = re.compile(r'\s+')   # runs of whitespace
nonword = re.compile(r'\W+')       # non-word characters

def __init__(self, elem, trims, should_cleanup):
    text = PyQuery(elem).text()
    # strip any caller-supplied substrings before normalizing
    for trim in (trims or []):
        text = text.replace(trim, '')
    self.rx = re.compile(r'\W+')
    self.text = text.strip()
    # collapse internal whitespace runs to single spaces
    self.trimmed_text = non_trimmed.sub(' ', self.text)
    self.html = PyQuery(elem).html()
    if should_cleanup:
        self.html = self.cleanup_html()
    # lower-cased, punctuation-free form for loose text comparison
    self.normalized_text = nonword.sub('', text.lower())
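
A quick standalone illustration of what the two patterns above do (the sample string is made up):

import re

non_trimmed = re.compile(r'\s+')
nonword = re.compile(r'\W+')

sample = '  Hello,   world! '
print(non_trimmed.sub(' ', sample.strip()))  # -> 'Hello, world!'
print(nonword.sub('', sample.lower()))       # -> 'helloworld'
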
Example #3

from pyquery import PyQuery

def process(url, handle):
    source = PyQuery(url=url)
    site_list = source.find("li.site-listing").find("a")
    content = ""
    for data in site_list:
        domain = PyQuery(data).text()
        # skip any "More" link; keep real domains only
        if domain.lower() == "more":
            continue
        content += domain + "\n"
    handle.write(content)
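
A hedged usage sketch (the URL is a placeholder; any page whose li.site-listing items wrap anchors would work):

with open('domains.txt', 'w') as handle:
    process('https://example.com/top-sites', handle)
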
Example #5
import arrow
from pyquery import PyQuery as PQ

def fetch_events():
    # some constants for this scrape; CONFIG, DATA_ROOT and the redis/handler
    # helpers are defined elsewhere in the project
    playerID = CONFIG['playerID']
    team = CONFIG['team']
    # use the Honolulu time zone so we scrape the right date
    # even during weird baseball hours
    today = arrow.now('Pacific/Honolulu').date()
    
    r = create_redis_connection()
        
    # scrape the MLB game list page
    game_day_url = DATA_ROOT + 'year_{0}/month_{1:02d}/day_{2:02d}/'.format(
        today.year, today.month, today.day)
    page = PQ(game_day_url)
    # find the links on page
    game_links = [PQ(link).attr('href') for link in page('li a')]
    # we only care about game data links for player's team
    game_links = [link.strip('/') for link in game_links if 'gid' in link and team.lower() in link]

    # iterate through team's games for the day
    for gameID in game_links:
        # get the player's batter data file for this game
        data_url = game_day_url + gameID
        data_url += "/batters/{0}.xml".format(playerID)
        page = PQ(data_url)
        # just the at-bat events please
        atbats = page('atbats ab')

        # iterate through player's at-bats
        for index, event in enumerate(atbats):
            atbat = index+1
            # see if we've seen this at-bat
            rkey = "{0}-{1}-AB{2}".format(gameID, playerID, atbat)
            stored = r.get(rkey)

            # store results of new at-bats so we only
            # match against events we haven't seen
            if not stored:
                result = PQ(event).attr("event")
                # avoid reusing the loop variable name inside the comprehension
                tracked = [e.lower() for e in CONFIG['events_tracked']]
                match = result.lower() in tracked
                r.set(rkey, result)
                
                # if we match, do a thing
                if match:
                    handle_match(result)
                else:
                    handle_miss(result)
    print('Done with scrape.')
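
For reference, a guess at the CONFIG mapping this scraper reads; the keys come from the lookups above, but the values are invented placeholders:

CONFIG = {
    'playerID': '12345',             # used to build the batter XML file name
    'team': 'bos',                   # token expected inside the game link
    'events_tracked': ['Home Run'],  # at-bat results worth reacting to
}
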
Example #6
from pyquery import PyQuery as PQ

def parse_html(filename, pixel='720', sub='', download='magnet'):
    with open(filename, encoding='utf-8') as f:
        html_raw = f.read()
    html = PQ(html_raw)
    tr_all = html.find('#seedlist').find('tr')
    tr_selected = []
    a_selected = []
    for tr in tr_all:
        tr = PQ(tr)
        title = PQ(tr.find('a')[0]).text()
        if pixel in title and sub.lower() in title.lower():
            tr_selected.append(title)
            for a in tr.find('a'):
                href = a.attrib['href']
                if download in href:
                    a_selected.append(href)
                    break
    print('total:', len(tr_selected), 'found:', len(a_selected))
    print('\n'.join(a_selected))
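
Usage sketch (the file name and filter values are placeholders):

parse_html('seedlist.html', pixel='1080', sub='', download='magnet')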