def downloadArticles(incidents):
    numIncidents = len(incidents)
    delSet = []
    index = 1
    for source in incidents:
        if "title" in incidents[source]:
            index += 1
            continue
        else:
            # print incidents[source]
            print index,"/",numIncidents
            results = download_article(source, False, True)
            if results[0]:
                success, title, text, date, title2 = results
                incidents[source]["title"] = title
                incidents[source]["body"]  = text
                incidents[source]["publishDate"]  = date
            else:
                delSet.append(source)
            index += 1    
    print len(incidents)

    print "Number of keys that are have no articles", len(delSet)
    for source in delSet:
        del incidents[source]
    print len(incidents)        
def download_articles_from_query(query_text, original_text, search_engine_name):
    if search_engine_name == "google":
        article_urls = get_related_urls_from_google(query_text)
    elif search_engine_name == "bing":
        article_urls = get_related_urls_from_bing(query_text)

    article_texts = []
    article_dates = []
    downloaded_urls = []
    selected_urls = []
    i = 1
    for url in article_urls:
        print "Checking URL ", i
        i += 1
        try:
            if "newslocker" not in url and "newsjs" not in url:
                downloaded_article = download_article(url, False, False)
                article_text = downloaded_article[2]
                article_date = downloaded_article[3]
                article_title = downloaded_article[4]
                if article_text:
                    article_texts.append(article_title + " " + article_text)
                    if article_date != None:
                        article_date = article_date.replace(tzinfo=None)
                    article_dates.append(article_date)
                    downloaded_urls.append(url)
        except Exception, e:
            pass
# Exemple #3 (score 0) -- stray paste-site marker; commented out so the module parses
def download_articles_from_query(query_text, original_text,
                                 search_engine_name):
    if search_engine_name == 'google':
        article_urls = get_related_urls_from_google(query_text)
    elif search_engine_name == 'bing':
        article_urls = get_related_urls_from_bing(query_text)

    article_texts = []
    article_dates = []
    downloaded_urls = []
    selected_urls = []
    i = 1
    for url in article_urls:
        print "Checking URL ", i
        i += 1
        try:
            if "newslocker" not in url and "newsjs" not in url:
                downloaded_article = download_article(url, False, False)
                article_text = downloaded_article[2]
                article_date = downloaded_article[3]
                article_title = downloaded_article[4]
                if article_text:
                    article_texts.append(article_title + " " + article_text)
                    if article_date != None:
                        article_date = article_date.replace(tzinfo=None)
                    article_dates.append(article_date)
                    downloaded_urls.append(url)
        except Exception, e:
            pass
def test_queries(event_dict):
    """
    Meta:
    1) date, 2) shooter_name, 3) killed_num, 4) wounded_num, 5) location

    Query types:
        1. (title)
        2. "Shooting in [location]"
        3. "Shooting in [location] on [date]"
        4. "Shooting in [location] on [date], [killed_num] killed"
        5. "Shooting in [location] on [date], [killed_num] killed, [wounded_num] wounded"
        6. "Shooting in [location] on [date] by [shooter name]"
        7. "Shooting in [location] on [date] by [shooter name], [killed_num] killed"
        8. "Shooting in [location] on [date] by [shooter name], [killed_num] killed, [wounded_num] wounded"
        9.
        10.
    """
    # meta = ['[date]', '[shooter_name]', '[killed_num]', '[wounded_num]', '[location]']
    # query_types = []
    # ["Shooting in [location]", "Shooting in [location] on [date]", "Shooting in [location] on [date], [killed_num] killed",
    # "Shooting in [location] on [date], [killed_num] killed, [wounded_num] wounded", "Shooting in [location] on [date] by [shooter_name]",
    # "Shooting in [location] on [date] by [shooter_name], [killed_num] killed",
    # "Shooting in [location] on [date] by [shooter_name], [killed_num] killed, [wounded_num] wounded"]
    query_scores = {}
    query_scores_ratios = {}
    count = 0

    for metadata, urls in event_dict.items():
        if urls is None or urls == []:
            continue
        status, title, text, date, title = download_article(urls[0], False, False)
        if title is None or len(title) < 5:
            continue
        print "Event count:", count
        print "Title:", title
        print "Original URL set:", urls
        print
        # urls = set(urls)
        city = metadata[4]
        query_types_with_title = [
            " ".join([city, title]),
            " ".join([title, city]),
            " ".join(title.split()[:10]),
        ]  # query_types[:]
        query_types_with_title.insert(0, title)
        results = {}
        results_ratios = {}
        for i, query_format in enumerate(query_types_with_title):

            # query = replace_with_metadata(query_format, meta, metadata)
            query = query_format
            # article_urls_google = set(get_related_urls_from_google(query))
            article_urls_bing = get_related_urls_from_bing(query)
            print "Query used:", query
            for url in article_urls_bing:
                print url.encode("ascii", "ignore")
            # query_scores[i] = query_scores.get(i,0) + len(article_urls_google.intersection(urls))
            query_scores[i] = query_scores.get(i, 0) + count_of_originals(article_urls_bing, urls)
            query_scores_ratios[i] = query_scores_ratios.get(i, 0) + count_of_ratio_relevance(article_urls_bing, urls)
            results[i] = query_scores[i]
            results_ratios[i] = query_scores_ratios[i]
            print
        count += 1
        print results
        print results_ratios
        print
    print count
    return query_scores
# Exemple #5 (score 0) -- stray paste-site marker; commented out so the module parses
def test_queries(event_dict):
    '''
    Meta:
    1) date, 2) shooter_name, 3) killed_num, 4) wounded_num, 5) location

    Query types:
        1. (title)
        2. "Shooting in [location]"
        3. "Shooting in [location] on [date]"
        4. "Shooting in [location] on [date], [killed_num] killed"
        5. "Shooting in [location] on [date], [killed_num] killed, [wounded_num] wounded"
        6. "Shooting in [location] on [date] by [shooter name]"
        7. "Shooting in [location] on [date] by [shooter name], [killed_num] killed"
        8. "Shooting in [location] on [date] by [shooter name], [killed_num] killed, [wounded_num] wounded"
        9.
        10.
    '''
    #meta = ['[date]', '[shooter_name]', '[killed_num]', '[wounded_num]', '[location]']
    #query_types = []
    #["Shooting in [location]", "Shooting in [location] on [date]", "Shooting in [location] on [date], [killed_num] killed",
    #"Shooting in [location] on [date], [killed_num] killed, [wounded_num] wounded", "Shooting in [location] on [date] by [shooter_name]",
    #"Shooting in [location] on [date] by [shooter_name], [killed_num] killed",
    #"Shooting in [location] on [date] by [shooter_name], [killed_num] killed, [wounded_num] wounded"]
    query_scores = {}
    query_scores_ratios = {}
    count = 0

    for metadata, urls in event_dict.items():
        if urls is None or urls == []:
            continue
        status, title, text, date, title = download_article(
            urls[0], False, False)
        if title is None or len(title) < 5:
            continue
        print "Event count:", count
        print "Title:", title
        print "Original URL set:", urls
        print
        #urls = set(urls)
        city = metadata[4]
        query_types_with_title = [
            " ".join([city, title]), " ".join([title, city]),
            " ".join(title.split()[:10])
        ]  #query_types[:]
        query_types_with_title.insert(0, title)
        results = {}
        results_ratios = {}
        for i, query_format in enumerate(query_types_with_title):

            #query = replace_with_metadata(query_format, meta, metadata)
            query = query_format
            #article_urls_google = set(get_related_urls_from_google(query))
            article_urls_bing = get_related_urls_from_bing(query)
            print "Query used:", query
            for url in article_urls_bing:
                print url.encode("ascii", "ignore")
            #query_scores[i] = query_scores.get(i,0) + len(article_urls_google.intersection(urls))
            query_scores[i] = query_scores.get(i, 0) + count_of_originals(
                article_urls_bing, urls)
            query_scores_ratios[i] = query_scores_ratios.get(
                i, 0) + count_of_ratio_relevance(article_urls_bing, urls)
            results[i] = query_scores[i]
            results_ratios[i] = query_scores_ratios[i]
            print
        count += 1
        print results
        print results_ratios
        print
    print count
    return query_scores