# load data and process identifiers
articles, identifiers = load_data(trainFile)

identifiers_tmp = []
titles = []
for e in identifiers:
    e = e.split(",")
    for i in range(NUM_ENTITIES):
        try:
            # spell out numeric entity values (e.g. "3" -> "three")
            e[i] = int(e[i])
            e[i] = inflect_engine.number_to_words(e[i])
        except:
            # non-numeric value: keep the original string
            pass
    identifiers_tmp.append(e[:NUM_ENTITIES])
    titles.append(",".join(e[NUM_ENTITIES:]))
identifiers = identifiers_tmp

# download related files
downloaded_articles = []
with open(saveFile, "wb") as f:
    for i in range(len(titles)):
        tmp = query.download_articles_from_query(
            titles[i] + " " + extra_query, " ".join(articles[i][0]), "bing")
        downloaded_articles.append(tmp)
        pickle.dump(
            [articles[i], titles[i], identifiers[i], downloaded_articles[i]], f)
        print "\r", i, "/", len(titles)
print

# save to file
print "Saved to file", saveFile
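# Not part of the original script: a minimal sketch of how the records written
# above could be read back. Each pickle.dump call appends one
# [article, title, identifiers, downloaded_article] record to saveFile, so the
# file holds a sequence of pickled objects that can be loaded until EOF.
# The helper name and its 'path' argument are illustrative, not from the repo.
import pickle

def load_downloaded_records(path):
    records = []
    with open(path, "rb") as f:
        while True:
            try:
                records.append(pickle.load(f))
            except EOFError:
                break
    return records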
        continue

    citations = incident['citations']
    for citation_ind, citation in enumerate(citations):
        saveFile = "../data/raw_data/" + incident_id + "_" + str(citation_ind) + ".raw"
        title = citation['Title']
        source = citation['Source']
        print "Now downloading ", incident_ind, "/", len(incidents), "pt", citation_ind

        # skip citations whose raw file has already been downloaded
        if saveFile in downloaded_articles:
            print saveFile, "skipped"
            continue

        # try:
        with open(saveFile, "wb") as f:
            query_text = title + ' ' + source
            clean_query = query_text.encode("ascii", "ignore")
            clean_summary = summary.encode("ascii", "ignore")
            articles = query.download_articles_from_query(clean_query, clean_summary, 'bing')
            if len(articles) > 0:
                # keep the top-ranked article and checkpoint progress
                article = articles[0]
                f.write(article)
                f.flush()
                downloaded_articles[saveFile] = article
                pickle.dump(downloaded_articles,
                            open('EMA_downloaded_articles_dump.p', 'wb'))
            else:
                downloaded_articles[saveFile] = "None"
                pickle.dump(downloaded_articles,
                            open('EMA_downloaded_articles_dump.p', 'wb'))
        # except Exception, e:
        #     downloaded_articles[saveFile] = "None"
        #     pickle.dump(downloaded_articles, open('EMA_downloaded_articles_dump.p', 'wb'))
        #     # raise e

        print "Saved to file", saveFile
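# Not part of the original script: a minimal sketch of how the incremental
# dump written above could be inspected. Each iteration rewrites
# 'EMA_downloaded_articles_dump.p' with the full downloaded_articles dict, so
# loading the file yields the latest {saveFile: article or "None"} mapping,
# i.e. the same mapping the loop uses to skip already-downloaded citations.
# The helper name is illustrative, not from the repo.
import pickle

def load_download_progress(dump_path='EMA_downloaded_articles_dump.p'):
    with open(dump_path, 'rb') as f:
        progress = pickle.load(f)
    failed = [path for path, article in progress.items() if article == "None"]
    print len(progress), "citations attempted,", len(failed), "without an article"
    return progress, failed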