Example #1
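Both snippets below come from a Python 2 scraping script and assume some module-level setup that the excerpt omits. A minimal sketch of those assumptions follows; every name in it is inferred from the calls below, not confirmed by the source:

import pickle

import inflect  # provides engine().number_to_words()
import query    # assumed local helper module wrapping a Bing search

inflect_engine = inflect.engine()

NUM_ENTITIES = 4                   # assumed: leading identifier fields per row
trainFile = '../data/train.csv'    # hypothetical path
saveFile = '../data/train.pkl'     # hypothetical path
extra_query = ''                   # assumed: extra terms appended to each query

def load_data(filename):
    # assumed signature: returns (articles, identifiers) parsed from filename
    raise NotImplementedError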
    # load data and convert numeric identifier fields to words
    articles, identifiers = load_data(trainFile)
    identifiers_tmp = []
    titles = []
    for e in identifiers:
        e = e.split(',')
        for i in range(NUM_ENTITIES):
            try:
                # spell out numeric entities, e.g. "3" -> "three"
                e[i] = int(e[i])
                e[i] = inflect_engine.number_to_words(e[i])
            except (ValueError, IndexError):
                # non-numeric or missing field: leave it unchanged
                pass
        identifiers_tmp.append(e[:NUM_ENTITIES])
        titles.append(','.join(e[NUM_ENTITIES:]))
    identifiers = identifiers_tmp

    # download related articles and pickle one record per title
    downloaded_articles = []
    with open(saveFile, "wb") as f:
        for i in range(len(titles)):
            tmp = query.download_articles_from_query(
                titles[i] + ' ' + extra_query, ' '.join(articles[i][0]),
                'bing')
            downloaded_articles.append(tmp)
            pickle.dump(
                [articles[i], titles[i], identifiers[i],
                 downloaded_articles[i]], f)
            # trailing comma keeps the progress counter on a single line
            print '\r', i, '/', len(titles),
        print

    print "Saved to file", saveFile
# Enclosing loop reconstructed from the names used below (incident_ind,
# incidents, incident, summary); the 'Summary' field and the skip
# condition are assumptions, since the excerpt began mid-loop.
for incident_ind, incident in enumerate(incidents):
    summary = incident.get('Summary')
    if summary is None:  # placeholder for the elided skip check
        continue
    citations = incident['citations']
    for citation_ind, citation in enumerate(citations):
        # incident_id is assumed to be defined earlier (not shown here)
        saveFile = ("../data/raw_data/" + incident_id + "_" +
                    str(citation_ind) + ".raw")
        title = citation['Title']
        source = citation['Source']
        print "Now downloading", incident_ind, "/", len(incidents), "pt", citation_ind
        if saveFile in downloaded_articles:
            print saveFile, "skipped"
            continue
        # try:
        with open(saveFile, "wb") as f:
            query_text = title + ' ' + source
            # strip non-ASCII characters before querying
            clean_query = query_text.encode("ascii", "ignore")
            clean_summary = summary.encode("ascii", "ignore")
            articles = query.download_articles_from_query(
                clean_query, clean_summary, 'bing')
            if len(articles) > 0:
                article = articles[0]
                f.write(article)
                downloaded_articles[saveFile] = article
            else:
                downloaded_articles[saveFile] = "None"
            # checkpoint after every citation so an interrupted run can resume
            with open('EMA_downloaded_articles_dump.p', 'wb') as dump_f:
                pickle.dump(downloaded_articles, dump_f)
        # except Exception, e:
        #     downloaded_articles[saveFile] = "None"
        #     pickle.dump(downloaded_articles, open('EMA_downloaded_articles_dump.p', 'wb'))
        #     # raise e
        print "Saved to file", saveFile
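The loop above checkpoints downloaded_articles to 'EMA_downloaded_articles_dump.p' after every citation, and the `if saveFile in downloaded_articles` check skips work that is already done. For that check to resume an interrupted run, the dump has to be reloaded before the loop starts; a minimal sketch, assuming the same file name:

import os
import pickle

# restore the checkpoint from a previous run, if any
if os.path.exists('EMA_downloaded_articles_dump.p'):
    with open('EMA_downloaded_articles_dump.p', 'rb') as f:
        downloaded_articles = pickle.load(f)
else:
    downloaded_articles = {}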