def setUp(self):
    """Empty the test database and refill it with a sample of crawled tweets.

    Opens one manager on ``configuration.db_name`` and one on
    ``configuration.db_name_test``, clears the test ``tweets`` and ``seeds``
    collections, copies a random sample of experiment ``"12345"`` tweets
    (and the seeds matching their authors) into the test database, and
    finally points ``configuration.db_name`` at the test database.
    """
    self.db_manager = mongo_manager.MongoManager(configuration.db_name)
    self.db_manager_test = mongo_manager.MongoManager(
        configuration.db_name_test)

    # Start from empty test collections.
    for collection in ("tweets", "seeds"):
        self.db_manager_test.delete_many(collection, {})

    self.num_tweet_for_test = 20

    # Extract n random tweets from the list.
    # NOTE(review): the extent of this `if` body was reconstructed from a
    # flattened source line -- confirm against the original file.
    existing_test_tweets = list(self.db_manager_test.find("tweets", {}))
    if not existing_test_tweets:
        logging.info("Crawl exemples seeds...")
        crawler_twitter = CrawlerTwitter("12345")
        seeds = ["EleonoraM_37"]
        new_seeds = crawler_twitter.run(100, seeds)
        crawler_twitter.run(100, new_seeds)
        crawler_twitter.storeSeeds(seeds)

        experiment_tweets = list(
            self.db_manager.find("tweets", {"id_experiment": "12345"}))
        tweets_test = random.sample(experiment_tweets,
                                    self.num_tweet_for_test)
        print(tweets_test)
        self.db_manager_test.write_mongo("tweets", tweets_test)

        # Copy over the seed documents whose handle matches each author.
        for tweet in tweets_test:
            handle = tweet["user"]["screen_name"]
            pprint.pprint(handle)
            seed_query = {"handle": handle, "id_experiment": "12345"}
            seed_test = list(self.db_manager.find("seeds", seed_query))
            self.db_manager_test.write_mongo("seeds", seed_test)

    # Later MongoManager(configuration.db_name) calls now open the test DB.
    configuration.db_name = configuration.db_name_test
def __init__(self, id_experiment):
    """Index ``seed_candidates`` and process the experiment's tweets.

    Args:
        id_experiment: value matched against the ``id_experiment`` field
            when loading tweets; also stored on the instance.
    """
    self.id_experiment = id_experiment
    self.db_manager = mongo_manager.MongoManager(configuration.db_name)

    # Compound descending index on (id_candidate, id_experiment).
    index_keys = [("id_candidate", DESCENDING),
                  ("id_experiment", DESCENDING)]
    self.db_manager.create_index("seed_candidates", index_keys)

    # Load every tweet of this experiment and hand them to run().
    experiment_query = {"id_experiment": id_experiment}
    experiment_tweets = list(self.db_manager.find("tweets", experiment_query))
    self.run(experiment_tweets)
def run_crawler_four_keys(self, id_experiment):
    """Split an experiment's tweets into per-language batches and run them.

    The batch size is ``ceil(ceil(total / 4) /
    configuration.NUMBER_REQUEST_DANDELION)`` where ``total`` is the number
    of tweets of the experiment. For each language, tweets are grouped into
    tuples of that size; any remainder forms one final, shorter tuple
    holding the trailing tweets in reverse order. All batches are passed to
    ``self.split_tweets_and_run``.

    Args:
        id_experiment: value matched against the ``id_experiment`` field
            of the ``tweets`` collection; also stored on the instance.
    """
    self.id_experiment = id_experiment
    # Documentation: http://python-dandelion-eu.readthedocs.io/en/latest/datatxt.html#nex-named-entity-extraction
    self.db_manager = mongo_manager.MongoManager(configuration.db_name)
    supported_languages = ("de", "en", "es", "fr", "it", "pt")

    experiment_tweets = list(
        self.db_manager.find("tweets", {"id_experiment": id_experiment}))
    total_tweets = len(experiment_tweets)
    tweets_per_key = math.ceil(total_tweets / 4)
    batch_size = math.ceil(
        tweets_per_key / configuration.NUMBER_REQUEST_DANDELION)
    print(total_tweets, tweets_per_key, batch_size)

    # Build the batches language by language.
    batches = []
    for language in supported_languages:
        language_query = {"lang": language, "id_experiment": id_experiment}
        language_tweets = list(self.db_manager.find("tweets", language_query))
        if not language_tweets:
            continue

        leftover_count = len(language_tweets) % batch_size
        full_end = len(language_tweets) - leftover_count

        # Full-sized batches, in query order.
        for start in range(0, full_end, batch_size):
            batches.append(tuple(language_tweets[start:start + batch_size]))

        # The remainder comes last, taken from the tail back to front.
        if leftover_count:
            batches.append(tuple(reversed(language_tweets[full_end:])))

    self.split_tweets_and_run(batches)
def __init__(self):
    """Dump seeds, top candidates and their entities to CSV files.

    Reads the starting seeds and the first 250 ``rank_candidates`` ordered
    by descending ``ranking_index``, writes both lists to CSV, then writes
    the ``entity_lf`` and ``entity`` documents whose ``seed`` field refers
    to a seed or to a candidate with a non-zero ``ranking_index``.
    """
    self.db_manager = mongo_manager.MongoManager(configuration.db_name)

    seeds = list(self.db_manager.find("seeds", {"starting": True}))
    ranked = self.db_manager.find("rank_candidates", {}).sort(
        "ranking_index", pymongo.DESCENDING)
    candidates = list(ranked)[:250]

    seed_ids = [seed["_id"] for seed in seeds]
    # Candidates ranked 0 are still written to cand.csv but contribute no
    # entities.
    cand_ids = [
        cand["_id"] for cand in candidates if cand["ranking_index"] != 0
    ]

    self.seed_candidates_to_csv(seeds, "../data/Out_csv/seed.csv")
    self.seed_candidates_to_csv(candidates, "../data/Out_csv/cand.csv")

    # (collection, owner ids, writer, destination), in export order.
    entity_exports = (
        ("entity_lf", seed_ids, self.save_entities_lf,
         "../data/Out_csv/seed_lfentities.csv"),
        ("entity_lf", cand_ids, self.save_entities_lf,
         "../data/Out_csv/cand_lfentities.csv"),
        ("entity", seed_ids, self.save_entities,
         "../data/Out_csv/seed_entities.csv"),
        ("entity", cand_ids, self.save_entities,
         "../data/Out_csv/cand_entities.csv"),
    )
    for collection, owner_ids, write_csv, csv_path in entity_exports:
        documents = list(
            self.db_manager.find(collection, {"seed": {"$in": owner_ids}}))
        write_csv(documents, csv_path)
def setUp(self):
    """Build a DataTXT client and a TweetsChunk over six stored tweets."""
    # Retrieve all tweets, then keep positions 10..15.
    manager = mongo_manager.MongoManager(configuration.db_name)
    stored_tweets = list(manager.find("tweets", {}))
    selected_tweets = stored_tweets[10:16]

    self.datatxt = DataTXT(
        app_id=configuration.APP1_ID,
        app_key=configuration.API_KEY_DANDELION1,
    )
    self.t = tweets_chunk.TweetsChunk(selected_tweets)
# Dandelion API credentials: four (API key, application id) pairs, read
# positionally from the command line.
configuration.API_KEY_DANDELION1 = sys.argv[1]
configuration.APP1_ID = sys.argv[2]
configuration.API_KEY_DANDELION2 = sys.argv[3]
configuration.APP2_ID = sys.argv[4]
configuration.API_KEY_DANDELION3 = sys.argv[5]
configuration.APP3_ID = sys.argv[6]
configuration.API_KEY_DANDELION4 = sys.argv[7]
configuration.APP4_ID = sys.argv[8]

# Twitter API
configuration.access_token = sys.argv[9]
configuration.access_token_secret = sys.argv[10]
configuration.consumer_key = sys.argv[11]
configuration.consumer_secret = sys.argv[12]

db_manager = mongo_manager.MongoManager(configuration.db_name)


def run_scenario(scenario):
    """Register a new experiment document for the given scenario.

    Reads ``seed.csv`` and ``expert_types.csv`` from
    ``../data/In_csv/<scenario>/`` and writes a document with ``email``,
    ``status`` and ``expert_types`` to the ``experiment`` collection.

    Args:
        scenario: name of the scenario; used as the input sub-directory
            and as the domain of the generated e-mail address.
    """
    diction = {"email": "marco.tagliabue@" + scenario + ".com"}
    diction["status"] = "processing"

    # `DataFrame.ix` was removed in pandas 1.0. `.iloc` selects the same
    # columns by position (second column of seed.csv, first column of
    # expert_types.csv), assuming the CSV headers are not integer labels.
    seeds_dataframe = pd.read_csv("../data/In_csv/" + scenario + "/seed.csv")
    # NOTE(review): `seeds` and `id_experiment` are not used in the visible
    # part of this function -- confirm whether the body continues elsewhere.
    seeds = seeds_dataframe.iloc[:, 1].tolist()

    expert_dataframe = pd.read_csv("../data/In_csv/" + scenario +
                                   "/expert_types.csv")
    experts = expert_dataframe.iloc[:, 0].tolist()
    diction["expert_types"] = experts

    id_experiment = db_manager.write_mongo("experiment", diction)
def setUp(self):
    """Load six user-mention fixture tweets into the test database.

    Clears the test ``tweets`` collection, writes one document per fixture
    (each holding only ``entities.user_mentions``), records the seed
    handles, points ``configuration.db_name`` at the test database and
    then instantiates ``ExtractCandidates``.
    """
    self.db_manager_test = mongo_manager.MongoManager(
        configuration.db_name_test)
    self.db_manager_test.delete_many("tweets", {})

    # Screen names mentioned by each fixture tweet, in insertion order.
    mentions_per_tweet = (
        ("marco", "luca", "andrea"),
        ("marco",),
        ("marco", "andrea"),
        ("marco", "luca"),
        ("luca", "andrea"),
        ("marco", "franco"),
    )
    for screen_names in mentions_per_tweet:
        user_mentions = [{"screen_name": name} for name in screen_names]
        tweet_document = {"entities": {"user_mentions": user_mentions}}
        self.db_manager_test.write_mongo("tweets", tweet_document)

    self.seeds = ["luca", "andrea", "franco"]

    # Switch databases before building the object under test.
    configuration.db_name = configuration.db_name_test
    self.e = ExtractCandidates()