def setUp(self):
        """Fill the test database with a random sample of tweets and seeds.

        The "tweets" and "seeds" collections of the test database are
        emptied first.  If the test "tweets" collection is then empty, a
        two-step crawl starting from a fixed seed handle is executed.
        Afterwards ``num_tweet_for_test`` randomly chosen tweets of
        experiment "12345" are copied from the main database, along with
        the seeds of that experiment whose handle equals each tweet
        author's screen name.  Finally ``configuration.db_name`` is
        redirected to the test database name.
        """
        experiment_id = "12345"

        self.db_manager = mongo_manager.MongoManager(configuration.db_name)
        self.db_manager_test = mongo_manager.MongoManager(
            configuration.db_name_test)
        for collection in ("tweets", "seeds"):
            self.db_manager_test.delete_many(collection, {})

        self.num_tweet_for_test = 20

        # Crawl only when the test collection holds no tweets.
        if not list(self.db_manager_test.find("tweets", {})):
            logging.info("Crawl exemples seeds...")
            crawler = CrawlerTwitter(experiment_id)
            starting_seeds = ["EleonoraM_37"]
            # Second crawl round starts from the seeds found by the first.
            crawler.run(100, crawler.run(100, starting_seeds))
            crawler.storeSeeds(starting_seeds)

        # Draw the random sample from the main database.
        experiment_tweets = list(
            self.db_manager.find("tweets", {"id_experiment": experiment_id}))
        sampled_tweets = random.sample(experiment_tweets,
                                       self.num_tweet_for_test)
        print(sampled_tweets)
        self.db_manager_test.write_mongo("tweets", sampled_tweets)

        # Copy the seed documents belonging to each sampled tweet's author.
        for tweet in sampled_tweets:
            handle = tweet["user"]["screen_name"]
            pprint.pprint(handle)
            author_filter = {"handle": handle, "id_experiment": experiment_id}
            matching_seeds = list(self.db_manager.find("seeds", author_filter))
            self.db_manager_test.write_mongo("seeds", matching_seeds)

        # From here on, code reading configuration.db_name gets the test name.
        configuration.db_name = configuration.db_name_test
    def __init__(self, id_experiment):
        """Set up the database handle and process the experiment's tweets.

        An index is requested on the "seed_candidates" collection, then all
        tweets stored for ``id_experiment`` are loaded and handed to
        ``self.run``.

        Args:
            id_experiment: Value matched against the "id_experiment" field
                of the stored tweets.
        """
        self.id_experiment = id_experiment
        self.db_manager = mongo_manager.MongoManager(configuration.db_name)

        # Index keys: id_candidate and id_experiment, both DESCENDING.
        index_keys = [(field, DESCENDING)
                      for field in ("id_candidate", "id_experiment")]
        self.db_manager.create_index("seed_candidates", index_keys)

        experiment_filter = {"id_experiment": id_experiment}
        experiment_tweets = list(
            self.db_manager.find("tweets", experiment_filter))
        self.run(experiment_tweets)
    def run_crawler_four_keys(self, id_experiment):
        """Group the experiment's tweets into per-language chunks and run them.

        The chunk size is ``ceil(ceil(total / 4) /
        configuration.NUMBER_REQUEST_DANDELION)`` where ``total`` is the
        number of tweets of the experiment.  For each supported language the
        tweets are cut into tuples of that size; tweets that do not fill a
        whole tuple are taken from the end of the list (last tweet first)
        and form one extra, shorter tuple.  All tuples are then passed to
        ``self.split_tweets_and_run``.

        Args:
            id_experiment: Value matched against the "id_experiment" field
                of the stored tweets.
        """
        self.id_experiment = id_experiment

        # Documentation: http://python-dandelion-eu.readthedocs.io/en/latest/datatxt.html#nex-named-entity-extraction
        self.db_manager = mongo_manager.MongoManager(configuration.db_name)
        languages = ("de", "en", "es", "fr", "it", "pt")

        experiment_filter = {"id_experiment": id_experiment}
        all_tweets = list(self.db_manager.find("tweets", experiment_filter))
        chunks_for_each_key = math.ceil(len(all_tweets) / 4)
        tweets_each_request = math.ceil(
            chunks_for_each_key / configuration.NUMBER_REQUEST_DANDELION)

        print(len(all_tweets), chunks_for_each_key, tweets_each_request)

        languages_chunks = []
        for language in languages:
            language_tweets = list(
                self.db_manager.find("tweets", {
                    "lang": language,
                    "id_experiment": id_experiment
                }))
            if not language_tweets:
                continue

            # Everything before `full_count` fills complete chunks.
            leftover_count = len(language_tweets) % tweets_each_request
            full_count = len(language_tweets) - leftover_count

            for start in range(0, full_count, tweets_each_request):
                stop = start + tweets_each_request
                languages_chunks.append(tuple(language_tweets[start:stop]))

            # The remainder keeps the "popped from the end" ordering.
            leftover = tuple(reversed(language_tweets[full_count:]))
            if leftover:
                languages_chunks.append(leftover)

        self.split_tweets_and_run(languages_chunks)
    def __init__(self):
        """Dump seeds, ranked candidates and their entities to CSV files.

        Starting seeds (``"starting": True``) and the first 250
        "rank_candidates" documents sorted by descending "ranking_index" are
        written with ``seed_candidates_to_csv``.  Documents of the
        "entity_lf" and "entity" collections whose "seed" field refers to
        one of those seeds, or to one of those candidates with a non-zero
        "ranking_index", are written with ``save_entities_lf`` and
        ``save_entities`` respectively.
        """
        self.db_manager = mongo_manager.MongoManager(configuration.db_name)

        seeds = list(self.db_manager.find("seeds", {"starting": True}))
        seed_ids = [seed["_id"] for seed in seeds]

        ranked_cursor = self.db_manager.find("rank_candidates", {}).sort(
            "ranking_index", pymongo.DESCENDING)
        candidates = list(ranked_cursor)[:250]
        # Candidates with a zero ranking index are exported to cand.csv but
        # are left out of the entity look-ups below.
        cand_ids = [
            candidate["_id"] for candidate in candidates
            if candidate["ranking_index"] != 0
        ]

        self.seed_candidates_to_csv(seeds, "../data/Out_csv/seed.csv")
        self.seed_candidates_to_csv(candidates, "../data/Out_csv/cand.csv")

        # (collection, writer, owner ids, destination) in export order.
        entity_exports = (
            ("entity_lf", self.save_entities_lf, seed_ids,
             "../data/Out_csv/seed_lfentities.csv"),
            ("entity_lf", self.save_entities_lf, cand_ids,
             "../data/Out_csv/cand_lfentities.csv"),
            ("entity", self.save_entities, seed_ids,
             "../data/Out_csv/seed_entities.csv"),
            ("entity", self.save_entities, cand_ids,
             "../data/Out_csv/cand_entities.csv"),
        )
        for collection, write_csv, owner_ids, csv_path in entity_exports:
            owned_entities = list(
                self.db_manager.find(collection, {"seed": {
                    "$in": owner_ids
                }}))
            write_csv(owned_entities, csv_path)
Beispiel #5
0
 def setUp(self):
     """Create a DataTXT client and a TweetsChunk built from six stored tweets.

     The tweets are the items at positions 10 to 15 of the full "tweets"
     collection listing.
     """
     tweet_store = mongo_manager.MongoManager(configuration.db_name)
     stored_tweets = list(tweet_store.find("tweets", {}))
     sample = stored_tweets[10:16]

     self.datatxt = DataTXT(app_id=configuration.APP1_ID,
                            app_key=configuration.API_KEY_DANDELION1)
     self.t = tweets_chunk.TweetsChunk(sample)
# Positional command-line arguments, in order starting at sys.argv[1]:
# four Dandelion (API key, app id) pairs followed by the Twitter API
# credentials.  Each value is stored on the `configuration` module under the
# listed attribute name.
for _position, _setting in enumerate(
    (
        "API_KEY_DANDELION1",
        "APP1_ID",
        "API_KEY_DANDELION2",
        "APP2_ID",
        "API_KEY_DANDELION3",
        "APP3_ID",
        "API_KEY_DANDELION4",
        "APP4_ID",
        # Twitter API
        "access_token",
        "access_token_secret",
        "consumer_key",
        "consumer_secret",
    ),
        start=1):
    setattr(configuration, _setting, sys.argv[_position])
# Do not leave the loop temporaries behind as module globals.
del _position, _setting

db_manager = mongo_manager.MongoManager(configuration.db_name)


def run_scenario(scenario):
    """Register a new "processing" experiment for the given scenario.

    Reads ``../data/In_csv/<scenario>/seed.csv`` and
    ``../data/In_csv/<scenario>/expert_types.csv`` and stores a document in
    the "experiment" collection containing an email derived from the
    scenario name, the status "processing" and the list of expert types.

    Args:
        scenario: Scenario name; used as the input sub-directory name and as
            the domain part of the generated email address.
    """
    diction = {"email": "marco.tagliabue@" + scenario + ".com"}
    diction["status"] = "processing"

    # Columns are selected by position with `.iloc`: the `.ix` indexer was
    # deprecated in pandas 0.20 and removed in pandas 1.0, so it raises
    # AttributeError on current versions.  For the string column labels
    # produced by read_csv, `.ix[:, n]` was positional as well.
    seeds_dataframe = pd.read_csv("../data/In_csv/" + scenario + "/seed.csv")
    # Second column of seed.csv.
    seeds = seeds_dataframe.iloc[:, 1].tolist()

    expert_dataframe = pd.read_csv("../data/In_csv/" + scenario +
                                   "/expert_types.csv")
    # First column of expert_types.csv.
    experts = expert_dataframe.iloc[:, 0].tolist()
    diction["expert_types"] = experts

    # NOTE(review): `seeds` and `id_experiment` are not used in the visible
    # part of this function; they are kept in case the body continues.
    id_experiment = db_manager.write_mongo("experiment", diction)
    def setUp(self):
        """Populate the test "tweets" collection with six mention-only tweets.

        The collection is emptied, then one document per entry of the table
        below is written; each document only carries
        ``entities.user_mentions`` with the listed screen names.  The seed
        list is stored on the test case, ``configuration.db_name`` is
        switched to the test database name and an ``ExtractCandidates``
        instance is created.
        """
        self.db_manager_test = mongo_manager.MongoManager(
            configuration.db_name_test)
        self.db_manager_test.delete_many("tweets", {})

        # One tuple per tweet: the screen names it mentions, in order.
        mentions_per_tweet = (
            ("marco", "luca", "andrea"),
            ("marco", ),
            ("marco", "andrea"),
            ("marco", "luca"),
            ("luca", "andrea"),
            ("marco", "franco"),
        )
        for screen_names in mentions_per_tweet:
            user_mentions = [{"screen_name": name} for name in screen_names]
            self.db_manager_test.write_mongo(
                "tweets", {"entities": {
                    "user_mentions": user_mentions
                }})

        self.seeds = ["luca", "andrea", "franco"]
        # Switch the configured database name before ExtractCandidates is
        # instantiated.
        configuration.db_name = configuration.db_name_test
        self.e = ExtractCandidates()