Example #1
def get_data(fields_to_retrieve,
             collection,
             config_fn,
             dataset_filename,
             filter_query=None):
    # Reuse the cached CSV if it exists; otherwise query the database
    if dataset_filename and os.path.isfile(dataset_filename):
        df = pd.read_csv(dataset_filename)
    else:
        dbm = DBManager(collection=collection, config_fn=config_fn)
        if not filter_query:
            filter_query = {}
        data = dbm.get_tweets_reduced(filter_query, fields_to_retrieve)
        df = pd.DataFrame(data)
        data = None  # free some memory
        df.to_csv(dataset_filename, index=False)

    return df
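A hypothetical usage sketch (the field spec, cache path, and config file below are placeholders; the exact shape of fields_to_retrieve depends on DBManager.get_tweets_reduced): the first call queries MongoDB and caches the result, later calls read the CSV directly.

df = get_data(fields_to_retrieve={'id_str': 1, 'text': 1},
              collection='rc_all',
              config_fn='config_mongo_inb.json',
              dataset_filename='data/tweets_cache.csv')
print(df.shape)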
Example #2
def setup_database_data():
    db_obj = DBManager(databases['books'])

    cleanup_query = [
        "delete from author_book", "delete from book", "delete from author"
    ]
    for query in cleanup_query:
        db_obj.processquery(query=query)

    setup_query = [
        "INSERT INTO `author` VALUES (2,'John Doe','2019-06-28 18:50:11'),(3,'Martin','2019-06-28 19:33:14'),(4,'Jeo','2019-06-29 00:37:02'),(5,'kelin','2019-06-29 01:37:16')",
        "INSERT INTO `book` VALUES (10,'Jungle book','123456',400,'No Books','UK States','2019-01-20','2019-06-29 01:24:16','2019-06-29 01:50:13')",
        "INSERT INTO `author_book` VALUES (14,10,2),(15,10,3),(16,10,4),(17,10,5)"
    ]
    for query in setup_query:
        db_obj.processquery(query=query)

    db_obj.commit()
    db_obj.close()
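The snippet assumes a module-level databases mapping holding the connection settings of each logical database; a hypothetical shape (all values are placeholders):

databases = {
    'books': {
        'host': 'localhost',
        'user': 'root',
        'password': 'secret',
        'database': 'bookstore'
    }
}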
Example #3
        def get_mids_by_surface():
            surface = request.args.get('surface').strip()
            print('[get_mids_by_surface]')

            mids = DBManager.get_candidate_entities(surface, 0.1)
            print(mids)
            res = {
                'candidates': '<br>'.join('%s %s' % (m[0], m[1]) for m in mids)
            }
            return json.dumps(res)
Example #4
def export_sentiment_scores_from_ids(file_tweet_ids, collection, config_fn):
    dbm = DBManager(collection=collection, config_fn=config_fn)
    tweets_to_export = []
    with open(file_tweet_ids, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            tweet_id = row['id']
            print('Processing tweet: {}'.format(tweet_id))
            tweet_obj = dbm.find_record({'id_str': str(tweet_id)})
            if tweet_obj is None:
                print('Missing tweet...')
                continue
            tweet_to_export = {
                'id': tweet_id,
                'text': tweet_obj['complete_text']
            }
            sentiment_obj = tweet_obj['sentiment']
            if 'sentiment_score_polyglot' in sentiment_obj:
                tweet_to_export['score_polyglot'] = \
                    sentiment_obj['sentiment_score_polyglot']
            if 'sentiment_score_sentipy' in sentiment_obj:
                tweet_to_export['score_sentipy'] = \
                    sentiment_obj['sentiment_score_sentipy']
            if 'sentiment_score_affin' in sentiment_obj:
                tweet_to_export['score_affin'] = \
                    sentiment_obj['sentiment_score_affin']
            if 'sentiment_score_vader' in sentiment_obj:
                tweet_to_export['score_vader'] = \
                    sentiment_obj['sentiment_score_vader']
            tweet_to_export['sentiment_score'] = sentiment_obj['score']
            tweet_to_export['human_label'] = row['label']
            tweets_to_export.append(tweet_to_export)
    output_file = '../data/bsc/processing_outputs/sentiment_scores_from_ids.csv'
    if not tweets_to_export:
        print('No tweets to export')
        return
    print('Saving tweets to the CSV {}'.format(output_file))
    # newline='' avoids blank lines between rows on Windows
    with open(output_file, 'w', newline='') as csv_file:
        headers = tweets_to_export[0].keys()
        csv_writer = csv.DictWriter(csv_file, fieldnames=headers)
        csv_writer.writeheader()
        csv_writer.writerows(tweets_to_export)
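The file passed as file_tweet_ids must contain at least the id and label columns read by the DictReader above; a minimal illustrative input (the ids are made up):

id,label
1098765432109876543,positive
1098765432109876544,negative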
Example #5
def upload_tweet_sentiment():
    print('Process started, it can take some time. Follow updates through '
          'the log...')
    dbm_local = DBManager(collection='tweets_esp_hpai')
    dbm_remote = DBManager(collection='bsc-ls',
                           config_fn='config_mongo_hpai.json')
    query = {'$and': [{'lang': {'$in': SPAIN_LANGUAGES}},
                      {'$or': [{'place.country': 'Spain'},
                               {'user.location': {'$in': \
                                   get_spain_places_regex()}}]}]}
    query.update({'retweeted_status': {'$exists': 0}})
    query.update({'sentiment_score': {'$exists': 1}})
    tweets = dbm_local.search(query)
    total_tweets = tweets.count()
    processing_counter = total_segs = modified_records = found_tweets = 0
    logging.info('Going to upload {0:,} tweets'.format(total_tweets))
    for tweet in tweets:
        start_time = time.time()
        processing_counter += 1
        logging.info('[{0}/{1}] Processing tweet:\n{2}'.\
                     format(processing_counter, total_tweets, tweet['id']))
        sentiment_dict = {
            'sentiment': {
                'score': tweet['sentiment_score']
                if tweet.get('sentiment_score_polyglot') else None,
            }
        }
        if tweet.get('sentiment_score_polyglot'):
            sentiment_dict['sentiment']['raw_score_polyglot'] = \
                tweet['sentiment_score_polyglot']
        if 'sentiment_score_sentipy' in tweet:
            sentiment_dict['sentiment']['raw_score_sentipy'] = \
                tweet['sentiment_score_sentipy']
        if 'sentiment_score_affin' in tweet:
            sentiment_dict['sentiment']['raw_score_affin'] = \
                tweet['sentiment_score_affin']
        ret_update = dbm_remote.update_record({'id': int(tweet['id'])},
                                              sentiment_dict)
        if ret_update.matched_count == 0:
            logging.info('Could not find a tweet with the id {} in the '
                         'remote server'.format(tweet['id']))
        elif ret_update.matched_count == 1:
            found_tweets += 1
            if ret_update.modified_count == 0:
                logging.info('Found tweet but did not update.')
            elif ret_update.modified_count == 1:
                modified_records += 1
                logging.info('Remote tweet updated with sentiment info!')
        total_segs += calculate_remaining_execution_time(
            start_time, total_segs, processing_counter, total_tweets)
    logging.info('Total processed tweets: {0:,}\n'\
                 'Total found tweets in remote server: {1:,}\n'
                 'Total updated tweets in remote server: {2:,}\n'.\
                 format(total_tweets, found_tweets, modified_records))
    print('Process finished!')
Example #6
def export_user_sample(sample_size,
                       collection,
                       config_file=None,
                       output_filename=None):
    project_dir = pathlib.Path(__file__).parents[1].resolve()
    if not output_filename:
        output_filename = 'user_sample.jsonl'
    output = os.path.join(project_dir, 'data', output_filename)
    dbm = DBManager(collection=collection, config_fn=config_file)
    query_filter = {'lang': 'es'}
    projection = {'_id': 0, 'user': 1}
    logging.info('Getting sample of users, please wait...')
    tweets = dbm.get_sample(int(sample_size), query_filter, projection)
    total_tweets = len(tweets)
    logging.info('Found {} users'.format(total_tweets))
    saved_tweets = 0
    with open(output, 'w') as f:
        for tweet in tweets:
            user_obj = tweet['user']
            if exists_user(user_obj):
                saved_tweets += 1
                logging.info('[{0}] Saving user: {1}'.format(
                    saved_tweets, user_obj['screen_name']))
                f.write("{}\n".format(json.dumps(user_obj)))
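A hypothetical call with placeholder collection and config names: sample 1,000 Spanish-language tweets and write one user object per line to data/user_sample.jsonl.

export_user_sample(1000, 'tweets', config_file='config_mongo.json',
                   output_filename='user_sample.jsonl')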
Example #7
def do_collection_merging(master_collection,
                          collections_to_merge,
                          config_fn=None):
    dbm_master = DBManager(collection=master_collection, config_fn=config_fn)
    for collection in collections_to_merge:
        logging.info('Merging collection {0} into {1}'.format(
            collection, master_collection))
        dbm_collection_to_merge = DBManager(collection=collection,
                                            config_fn=config_fn)
        tweets = dbm_collection_to_merge.find_all()
        logging.info('Trying to insert {0:,} tweets'.format(tweets.count()))
        try:
            ret_insertions = dbm_master.insert_many_tweets(tweets,
                                                           ordered=False)
            insertion_counter = len(ret_insertions.inserted_ids)
            logging.info('{0:,} new tweets were inserted into the collection {1}'.\
                         format(insertion_counter, master_collection))
        except Exception as e:
            logging.error('Error when merging {}'.format(e))
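A hypothetical call, with placeholder collection names, that merges two weekly collections into a master collection:

do_collection_merging('tweets_master',
                      ['tweets_week_25', 'tweets_week_26'],
                      config_fn='config_mongo.json')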
Example #8
    def __init__(self):

        self.db_obj = DBManager(databases['books'])
        self.dao_obj = BooksDAO(self.db_obj)
Example #9
class InternalBooks(object):
    """
        This is the class for manager Internal book store
    """
    def __init__(self):

        self.db_obj = DBManager(databases['books'])
        self.dao_obj = BooksDAO(self.db_obj)

    def __get_books_by_filter(self, key, value):
        """
        :param book_id:
        :return:
        """
        search_func = {
            "all": self.dao_obj.get_all_books,
            "id": self.dao_obj.get_books_by_id,
            "name": self.dao_obj.get_books_by_name,
            "publisher": self.dao_obj.get_books_by_publisher,
            "country": self.dao_obj.get_books_by_country,
            "release date": self.dao_obj.get_books_by_release_date,
        }

        result = search_func[key](value)

        for row in result:

            res_author = self.dao_obj.get_author_by_book_id(row['id'])
            author_list = [val['name'] for val in res_author]
            row.update({"authors": author_list})
            row["release_date"] = str(row["release_date"])

        return result

    @staticmethod
    def format_get_response(book_info):
        """
        :param book_info:
        :return:
        """
        response = {"status_code": 200, "status": "success", "data": []}
        if book_info:
            response.update({"data": book_info})

        return response

    def get_books(self, key, value):
        """
        :param key:
        :param value:
        :return:
        """
        try:
            logger.info("Entering get_books")
            result = self.__get_books_by_filter(key, value)
            if result and key == "id":
                result = result[0]
            logger.info("Exiting get_books")
            return self.format_get_response(result)

        except Exception as err:
            logger.exception(err)
            raise
        finally:
            self.db_obj.close()

    @staticmethod
    def format_insert_response(book_info):
        """
        :return:
        """
        response = {"status_code": 201, "status": "success", "data": []}

        book_info.pop("id")
        response["data"].append({"book": book_info})
        return response

    def __link_author_book(self, book_id, author_list):
        """
        :param book_id:
        :param author_list:
        :return:
        """

        for row in author_list:

            res_author = self.dao_obj.get_author(row)

            if res_author:
                author_id = res_author["author_id"]
            else:
                author_id = self.dao_obj.insert_into_author(row)

            self.dao_obj.insert_into_author_book(author_id, book_id)

    def insert_book(self, new_book):
        """
        Insert a new book into the store.
        :return:
        """
        try:
            logger.info("Entering insert_book")

            book_info = self.__get_books_by_filter("name", new_book["name"])

            if book_info:
                return self.format_insert_response(book_info[0])

            book_entity_obj = Book(new_book)
            book_id = self.dao_obj.insert_into_book(book_entity_obj)

            self.__link_author_book(book_id, book_entity_obj.authors)

            book_info = self.__get_books_by_filter("id", book_id)
            response = self.format_insert_response(book_info[0])

            self.db_obj.commit()

            logger.info("Exiting insert_book")
            return response

        except Exception as err:
            logger.exception(err)
            self.db_obj.rollback()
            raise
        finally:
            self.db_obj.close()

    @staticmethod
    def format_update_response(book_info):
        """
        :param book_info:
        :return:
        """
        response = {
            "status_code": 200,
            "status": "success",
            "message": "The book {} was updated successfully".format(
                book_info["name"])
        }

        response.update({"data": book_info})

        return response

    def patch_book(self, book_id, patch_book):
        """

        :return:
        """
        try:

            logger.info("Entering patch_book")
            book_info = self.__get_books_by_filter("id", book_id)
            if not book_info:
                raise ResourceNotAvailable(
                    "Given resource {} is not available".format(book_id))

            book_entity = Book(book_info[0])
            author_to_update = False
            for key, value in patch_book.items():
                setattr(book_entity, key, value)
                if key == "authors":
                    author_to_update = True

            self.dao_obj.update_into_book(book_id, book_entity)

            if author_to_update:
                self.dao_obj.delete_author_book(book_id)
                self.__link_author_book(book_id, patch_book["authors"])

            book_info = self.__get_books_by_filter("id", book_id)
            response = self.format_update_response(book_info[0])

            self.db_obj.commit()
            logger.info("Exiting patch_book")
            return response

        except Exception as err:
            self.db_obj.rollback()
            logger.exception(err)
            raise
        finally:
            self.db_obj.close()

    @staticmethod
    def format_delete_response(book_name):
        """
        :param book_name:
        :return:
        """

        response = {
            "status_code": 200,
            "status": "success",
            "message": "The book {} was deleted successfully".format(book_name),
            "data": []
        }
        return response

    def delete_book(self, book_id):
        """
        :param book_id:
        :return:
        """
        try:

            logger.info("Entering delete_book")
            book_info = self.__get_books_by_filter("id", book_id)
            if not book_info:
                raise ResourceNotAvailable(
                    "Given resource {} is not available".format(book_id))

            self.dao_obj.delete_author_book(book_id)
            self.dao_obj.delete_from_book(book_id)

            response = self.format_delete_response(book_info[0]["name"])

            self.db_obj.commit()
            logger.info("Exiting delete_book")
            return response

        except Exception as err:
            self.db_obj.rollback()
            logger.exception(err)
            raise
        finally:
            self.db_obj.close()
Example #10
        def get_subgraph():
            mid = request.args.get('mid').strip()
            print('[get_subgraph]', mid)
            subgraph = self.freebase.client.get_subgraph(mid)
            print "subgraph", subgraph
            links = []
            nodes_ = {}

            for path in subgraph:
                if len(path) == 1:
                    p1 = path[0]
                    if p1[0] not in nodes_:
                        nodes_[p1[0]] = {
                            'category': 0,
                            'name': p1[0],
                            'value': 10
                        }

                    if p1[2] not in nodes_:
                        nodes_[p1[2]] = {
                            'category': 2,
                            'name': p1[2],
                            'value': 4
                        }
                else:
                    p1 = path[0]
                    if p1[0] not in nodes_:
                        nodes_[p1[0]] = {
                            'category': 0,
                            'name': p1[0],
                            'value': 10
                        }
                    if p1[2] not in nodes_:
                        nodes_[p1[2]] = {
                            'category': 1,
                            'name': p1[2],
                            'value': 4
                        }
                    p2 = path[1]
                    if p2[2] not in nodes_:
                        nodes_[p2[2]] = {
                            'category': 2,
                            'name': p2[2],
                            'value': 4
                        }

            for m in nodes_.keys():
                name, name_info = DBManager.get_name(m)
                nodes_[m]['label'] = name

            nodes = list(nodes_.values())  # list() so json.dumps can serialize it

            for path in subgraph:
                if len(path) == 1:
                    t = path[0]
                    links.append({
                        'source': nodes_[t[0]]['name'],
                        'target': nodes_[t[2]]['name'],
                        'weight': 2,
                        'name': t[1]
                    })
                else:
                    t = path[0]
                    links.append({
                        'source': nodes_[t[0]]['name'],
                        'target': nodes_[t[2]]['name'],
                        'weight': 2,
                        'name': t[1]
                    })
                    t = path[1]
                    links.append({
                        'source': nodes_[t[0]]['name'],
                        'target': nodes_[t[2]]['name'],
                        'weight': 2,
                        'name': t[1]
                    })
            print('node', nodes)
            print('links', links)

            return json.dumps({'nodes': nodes, 'links': links})
Example #11
    def __init__(self, collection=None):
        if collection is not None:
            self.__dbm_tweets = DBManager(collection)
        self.__dbm_users = DBManager('users')
        self.__dbm_networks = DBManager('networks')
        self.__network = []
Example #12
class NetworkAnalyzer:
    __dbm_tweets = None
    __dbm_users = None
    __dbm_networks = None
    __network = None
    __graph = None
    __nodes = set()
    __unknown_users = set()
    __node_sizes = None

    def __init__(self, collection=None):
        if collection is not None:
            self.__dbm_tweets = DBManager(collection)
        self.__dbm_users = DBManager('users')
        self.__dbm_networks = DBManager('networks')
        self.__network = []

    def __computer_ff_ratio(self, friends, followers):
        if followers > 0 and friends > 0:
            return friends / followers
        else:
            return 0

    # Get incoming interactions of a given user
    def get_in_interactions(self, user_screen_name):
        # compute in-interactions, i.e., interactions in which the user
        # was mentioned, retweeted, quoted, or replied to
        in_inter_query = {
            'interactions.' + user_screen_name: {
                '$exists': 1
            },
            'screen_name': {
                '$ne': user_screen_name
            }
        }
        n_users = self.__dbm_users.search(in_inter_query)
        in_interactions_dict, in_rts, in_rps = {}, {}, {}
        in_qts, in_mts = {}, {}
        total_in_interactions = 0
        total_in_retweets, total_in_replies = 0, 0
        total_in_mentions, total_in_quotes = 0, 0
        for n_user in n_users:
            n_user_interactions = n_user['interactions']
            for i_user, interactions in n_user_interactions.items():
                if i_user == user_screen_name:
                    in_interactions_dict[
                        n_user['screen_name']] = interactions['total']
                    total_in_interactions += interactions['total']
                    if 'retweets' in interactions.keys():
                        total_in_retweets += interactions['retweets']
                        in_rts[
                            n_user['screen_name']] = interactions['retweets']
                    if 'replies' in interactions.keys():
                        total_in_replies += interactions['replies']
                        in_rps[n_user['screen_name']] = interactions['replies']
                    if 'mentions' in interactions.keys():
                        total_in_mentions += interactions['mentions']
                        in_mts[
                            n_user['screen_name']] = interactions['mentions']
                    if 'quotes' in interactions.keys():
                        total_in_quotes += interactions['quotes']
                        in_qts[n_user['screen_name']] = interactions['quotes']
        in_interactions_obj = {
            'total': {
                'count': total_in_interactions,
                'details': in_interactions_dict
            },
            'replies': {
                'count': total_in_replies,
                'details': in_rps
            },
            'retweets': {
                'count': total_in_retweets,
                'details': in_rts
            },
            'mentions': {
                'count': total_in_mentions,
                'details': in_mts
            },
            'quotes': {
                'count': total_in_quotes,
                'details': in_qts
            }
        }
        user_dict = {'in_interactions': in_interactions_obj}
        return user_dict

    # Get outgoing interactions of a given user
    def get_out_interactions(self, user_screen_name):
        user = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        # compute out interactions, meaning, interactions originated by
        # the user
        user_interactions = user['interactions']
        out_interactions_dict, out_rts = {}, {}
        out_rps, out_qts, out_mts = {}, {}, {}
        total_out_interactions, total_out_retweets = 0, 0
        total_out_mentions, total_out_replies = 0, 0
        total_out_quotes = 0
        for recipient, interactions in user_interactions.items():
            out_interactions_dict[recipient] = interactions['total']
            total_out_interactions += interactions['total']
            if 'retweets' in interactions:
                total_out_retweets += interactions['retweets']
                out_rts[recipient] = interactions['retweets']
            if 'replies' in interactions:
                total_out_replies += interactions['replies']
                out_rps[recipient] = interactions['replies']
            if 'mentions' in interactions:
                total_out_mentions += interactions['mentions']
                out_mts[recipient] = interactions['mentions']
            if 'quotes' in interactions:
                total_out_quotes += interactions['quotes']
                out_qts[recipient] = interactions['quotes']
        out_interactions_obj = {
            'total': {
                'count': total_out_interactions,
                'details': out_interactions_dict
            },
            'replies': {
                'count': total_out_replies,
                'details': out_rps
            },
            'retweets': {
                'count': total_out_retweets,
                'details': out_rts
            },
            'mentions': {
                'count': total_out_mentions,
                'details': out_mts
            },
            'quotes': {
                'count': total_out_quotes,
                'details': out_qts
            }
        }
        # compile all information in a dictionary
        user_dict = {'out_interactions': out_interactions_obj}
        return user_dict

    def create_users_db(self, clear_collection=False):
        logging.info(
            '::. Network Analyzer: Creating database of users, it can take several minutes, please wait...'
        )
        if clear_collection:
            self.__dbm_users.clear_collection()
        users = self.__dbm_tweets.get_unique_users()
        users_count = len(users)
        logging.info(
            '::. Network Analyzer: Extracted {0} unique users from the database...'
            .format(users_count))
        progress = 1
        for user in users:
            db_user = {
                'screen_name': user['screen_name'],
                'friends': user['friends'],
                'followers': user['followers'],
                'ff_ratio': self.__computer_ff_ratio(user['friends'],
                                                     user['followers']),
                'interactions': user['interactions'],
                'tweets': user['tweets_count'],
                'original_tweets': user['original_count'],
                'rts': user['retweets_count'],
                'qts': user['quotes_count'],
                'rps': user['replies_count'],
                'verified': user['verified']
            }
            filter_query = {'screen_name': user['screen_name']}
            logging.debug(
                '::. Network Analyzer: Updating/creating user {0} ({1}/{2})...'
                .format(user['screen_name'], progress, users_count))
            progress += 1
            self.__dbm_users.update_record(filter_query,
                                           db_user,
                                           create_if_doesnt_exist=True)

    def generate_network(self,
                         subnet_query=None,
                         depth=1,
                         file_name='network',
                         override_net=False):
        # avoid a mutable default argument
        if subnet_query is None:
            subnet_query = {}
        net_query = subnet_query.copy()
        net_query.update({'depth': depth})
        ret_net = self.__dbm_networks.search(net_query)
        # the net doesn't exist yet, let's create it
        if ret_net.count() == 0 or override_net:
            logging.info(
                'Generating the network, it can take several minutes, please wait...'
            )
            users = self.__dbm_users.search(subnet_query)
            # for each user generate his/her edges
            for user in users:
                if 'ff_ratio' in user.keys():
                    u_ff_ratio = user['ff_ratio']
                else:
                    u_ff_ratio = self.__computer_ff_ratio(
                        user['friends'], user['followers'])
                exists = user.get('exists', '')
                self.__nodes.add(
                    tuple({
                        'screen_name': user['screen_name'],
                        'ff_ratio': u_ff_ratio,
                        'exists': exists
                    }.items()))
                for interacted_user, interactions in \
                        user['interactions'].items():
                    iuser = self.__dbm_users.find_record(
                        {'screen_name': interacted_user})
                    if not iuser:
                        # unknown user: at depth > 1 try to recover the
                        # friends/followers ratio from the tweets collection
                        if depth > 1:
                            i_ff_ratio = self.__get_ffratio(interacted_user)
                            if i_ff_ratio is None:
                                self.__unknown_users.add(interacted_user)
                                continue
                        else:
                            self.__unknown_users.add(interacted_user)
                            continue
                        iuser_screen_name = interacted_user
                        exists_iuser = ''
                    else:
                        if 'ff_ratio' in iuser:
                            i_ff_ratio = iuser['ff_ratio']
                        else:
                            i_ff_ratio = self.__computer_ff_ratio(
                                iuser['friends'], iuser['followers'])
                        iuser_screen_name = iuser['screen_name']
                        exists_iuser = iuser.get('exists', '')

                    self.__nodes.add(
                        tuple({
                            'screen_name': iuser_screen_name,
                            'ff_ratio': i_ff_ratio
                        }.items()))
                    edge = {
                        'nodeA': {
                            'screen_name': user['screen_name'],
                            'ff_ratio': u_ff_ratio,
                            'exists': exists
                        },
                        'nodeB': {
                            'screen_name': interacted_user,
                            'ff_ratio': i_ff_ratio,
                            'exists': exists_iuser
                        },
                        'weight': interactions['total']
                    }
                    self.__network.append(edge)
            logging.info('Created a network of {0} nodes and {1} edges'.format(
                len(self.__nodes), len(self.__network)))
            logging.info('Unknown users {0}'.format(len(self.__unknown_users)))
            # save the net in a gefx file for posterior usage
            f_name = self.save_network_in_gexf_format(file_name)
            logging.info('Saved the network in the file {0}'.format(f_name))
            db_net = {'file_name': str(f_name)}
            db_net.update(net_query)
            self.__dbm_networks.save_record(db_net)
        else:
            f_net = ret_net[0]
            logging.info(
                'The network was already generated, please find it at {0}'.
                format(f_net['file_name']))

    def create_graph(self):
        logging.info('Creating the graph, please wait...')
        self.__graph = net.DiGraph()
        ff_ratio = defaultdict(lambda: 0.0)
        # create a directed graph from the edge data and populate a dictionary
        # with the friends/followers ratio
        for edge in self.__network:
            user = edge['nodeA']['screen_name']
            interacted_with = edge['nodeB']['screen_name']
            num_interactions = edge['weight']
            u_ff_ratio = edge['nodeA']['ff_ratio']
            self.__graph.add_edge(user,
                                  interacted_with,
                                  weight=int(num_interactions))
            ff_ratio[user] = float(u_ff_ratio)
        # obtain central node
        # degrees = net.degree(self.__graph)
        # central_node, max_degree = sorted(degrees, key=itemgetter(1))[-1]
        # center the graph around the central node
        # ego_graph = net.DiGraph(net.ego_graph(self.__graph, central_node))
        return

    def get_graph_nodes(self):
        return len(self.__nodes)

    def get_graph_edges(self):
        return len(self.__network)

    def get_graph(self):
        return self.__graph

    def get_node_sizes(self):
        return self.__node_sizes

    def __get_ffratio(self, screen_name):
        query = {
            '$or': [{
                'user.screen_name': screen_name
            }, {
                'retweeted_status.user.screen_name': screen_name
            }, {
                'quoted_status.user.screen_name': screen_name
            }]
        }
        tweet_obj = self.__dbm_tweets.find_record(query)
        if tweet_obj:
            tweet = tweet_obj['tweet_obj']
            if 'retweeted_status' in tweet.keys():
                return self.__computer_ff_ratio(
                    tweet['retweeted_status']['user']['friends_count'],
                    tweet['retweeted_status']['user']['followers_count'])
            elif 'quoted_status' in tweet.keys():
                return self.__computer_ff_ratio(
                    tweet['quoted_status']['user']['friends_count'],
                    tweet['quoted_status']['user']['followers_count'])
            else:
                return self.__computer_ff_ratio(
                    tweet['user']['friends_count'],
                    tweet['user']['followers_count'])
        else:
            return None

    def save_network_in_gexf_format(self, file_name):
        # dashes, not slashes, in the date: slashes would be read as path separators
        today = datetime.strftime(datetime.now(), '%m-%d-%y')
        f_name = pathlib.Path(__file__).parents[1].joinpath(
            'sna', 'gefx', file_name + '_' + today + '.gexf')

        with open(str(f_name), 'w', encoding='utf-8') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write(
                '<gexf xmlns="http://www.gexf.net/1.2draft" xmlns:viz="http://www.gexf.net/1.1draft/viz" '
                'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd" '
                'version="1.2">\n')
            f.write('<meta lastmodifieddate="{0}">\n'.format(today))
            f.write('<creator>NetworkAnalysis</creator>\n')
            f.write('<description>{0}</description>\n'.format(file_name))
            f.write('</meta>\n')
            f.write('<graph mode="static" defaultedgetype="directed">\n')
            # add data attributes
            f.write('<attributes class="node">\n')
            f.write('<attribute id="2" title="ff_ratio" type="float"/>\n')
            f.write('<attribute id="5" title="exists" type="float"/>\n')
            f.write('</attributes>\n')
            # add nodes
            f.write('<nodes>\n')
            node_id = 0
            list_nodes = []
            for node_tup in self.__nodes:
                node = dict(node_tup)
                f.write('<node id="{0}" label="{1}">\n'.format(
                    node_id, node['screen_name']))
                f.write('<attvalues>\n')
                f.write('<attvalue for="2" value="{0}"/>\n'.format(
                    node['ff_ratio']))
                f.write('</attvalues>\n')
                #f.write('<viz:size value="{0}"/>\n'.format(node['ff_ratio']))
                f.write('</node>\n')
                node_id += 1
                list_nodes.append(node['screen_name'])
            f.write('</nodes>\n')
            # add edges
            f.write('<edges>\n')
            edge_id = 0
            for edge in list(self.__network):
                id_vertexA = list_nodes.index(edge['nodeA']['screen_name'])
                id_vertexB = list_nodes.index(edge['nodeB']['screen_name'])
                weight = edge['weight']
                f.write(
                    '<edge id="{0}" source="{1}" target="{2}" weight="{3}"/>\n'
                    .format(edge_id, id_vertexA, id_vertexB, weight))
                edge_id += 1
            f.write('</edges>\n')
            f.write('</graph>\n')
            f.write('</gexf>\n')
        return f_name
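A minimal sketch of the intended pipeline, assuming a populated tweets collection (the collection name 'tweets' is a placeholder):

analyzer = NetworkAnalyzer('tweets')
analyzer.create_users_db(clear_collection=True)    # build the users collection
analyzer.generate_network(file_name='my_network')  # compute edges, save the .gexf file
analyzer.create_graph()                            # build the NetworkX DiGraph
print(analyzer.get_graph_nodes(), analyzer.get_graph_edges())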
Example #13
class FaceRecognition(QMainWindow):
    def __init__(self):
        super(FaceRecognition, self).__init__()
        loadUi('ui/main.ui', self)

        self.webcam = WebCam(self.lblCamera, QTimer(self), self.recognize_each_frame)
        self.webcam.start()

        self.db_manager = DBManager()
        self.recognizer = Recognizer(self.db_manager, transform)
        self.btRecognize.clicked.connect(self.recognize)
        self.btAddToDB.clicked.connect(self.add_to_db)

        self.lblres = ImageWidget(self.lblResult)
        self.img = None
        self.clear_result_fields()
        self.btReset.clicked.connect(self.clear_result_fields)

    def recognize_each_frame(self, img):
        '''
        Called by the WebCam object on each frame.

        Args:
            img (array): current frame
        Updates the img field so that other methods of the class can access it.
        Returns:
            If the display box is checked, recognizes and draws the box and
            person data on the frame; otherwise returns img without any
            transforms.
        '''
        self.img = img
        if self.img is None:
            self.clear_result_fields()
            return

        if self.btDisplay.isChecked():
            clients, locations = self.recognizer.recognize(self.img)
            clients = list(clients)
            img = self.recognizer.draw_results(img, locations, clients)

        return img

    def recognize(self):
        '''
        Run recognition on the frame at a specific moment in time.
        Updates the result fields.
        '''
        # clear fields
        self.clear_result_fields()

        # do recognition
        clients, _ = self.recognizer.recognize(self.img)
        clients = list(clients)

        self.btReset.setEnabled(True)

        # if no match, the person is unknown
        if len(clients) == 0:
            self.fill_result_fields('Unknown', noname_img)
            return 

        # get the first found result (only one person is expected on the frame)
        self.fill_result_fields(self.db_manager.get_fullname(clients[0]), clients[0]['img_path'])

    def add_to_db(self):
        self.widget = DataDialog(self, self.img, self.db_manager)
        self.widget.show()

    def clear_result_fields(self):
        self.btReset.setEnabled(False)
        self.fill_result_fields('', noname_img)

    def fill_result_fields(self, text, path):
        # type: (str, str) -> None
        self.lblName.setText(text)
        img = cv2.imread(path)
        self.lblres.displayImage(img)
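A hypothetical entry point for the window, assuming PyQt5 and the ui/main.ui file referenced above are available:

import sys
from PyQt5.QtWidgets import QApplication

if __name__ == '__main__':
    app = QApplication(sys.argv)
    window = FaceRecognition()
    window.show()
    sys.exit(app.exec_())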
Example #14
def do_update_collection(collection_name,
                         source_collection,
                         end_date,
                         start_date=None,
                         config_fn=None):
    dbm_weekly_collection = DBManager(config_fn=config_fn)
    # Create the collection if it does not exist
    created_collection = dbm_weekly_collection.create_collection(
        collection_name)
    if created_collection:
        logging.info('Creating collection: {}...'.format(collection_name))
        dbm_weekly_collection.create_index('id', 'asc', unique=True)
        logging.info('Creating index: id...')
    else:
        logging.info(
            'Setting collection of database to {}'.format(collection_name))
        dbm_weekly_collection.set_collection(collection_name)
    dbm_source = DBManager(collection=source_collection, config_fn=config_fn)
    # If no start date is passed, use today's date
    if not start_date:
        start_date = datetime.today().strftime('%Y-%m-%d')
    query = {'created_at_date': {'$gte': start_date, '$lte': end_date}}
    logging.info('Searching for tweets between {0} and {1}...'.\
                 format(start_date, end_date))
    tweets_to_copy = dbm_source.search(query)
    logging.info('Going to insert {0:,} tweets into the collection {1}'.\
                 format(tweets_to_copy.count(), collection_name))
    try:
        ret_insertions = dbm_weekly_collection.insert_many_tweets(
            tweets_to_copy, ordered=False)
        insertion_counter = len(ret_insertions.inserted_ids)
        logging.info('{0:,} new tweets were inserted into the collection {1}'.\
                     format(insertion_counter, collection_name))
    except Exception as e:
        logging.error('Error when copying tweets {}'.format(e))
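A hypothetical call with placeholder names and dates, copying one week of tweets into a weekly collection (created_at_date is compared as a 'YYYY-MM-DD' string, matching the default start_date above):

do_update_collection('tweets_week_26', 'tweets_all',
                     end_date='2020-06-28',
                     start_date='2020-06-22',
                     config_fn='config_mongo.json')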
Example #15
def gen_unsolved_sentence(fn_in, fn_out):
    avg_candidate = 0
    num = 0
    with open(fn_in) as fin, open(fn_out, 'w') as fout:
        for line in fin:
            data = json.loads(line)  # the encoding argument is gone in Python 3
            gold_entity = data['entity']
            surfaces = data['predict'].split("\t")

            candidates = dict()
            for surface in surfaces:
                surface = surface.lower().replace(' ', '')
                res = DBManager.get_candidate_entities(surface, 0.1)

                for e in res:
                    if e[0] not in candidates or e[1] > candidates[e[0]]:
                        candidates[e[0]] = e[1]
            if len(candidates) == 0:
                sentence = [w.split('|')[0]
                            for w in data['tag_res'].split()][1:-1]
                if 'pos' in data:
                    all_pos = data['pos'][1:-1]
                else:
                    all_pos = None
                # use ngram of surface
                for surface in surfaces:
                    surface = surface.lower().split()
                    if len(surface) == 0:
                        continue
                    start = find_word(sentence, surface)
                    if start == -1:
                        continue
                    l = len(surface)
                    found = False
                    for j in range(l, 0, -1):
                        # if found:
                        #     break
                        for i in range(l - j + 1):
                            if 'pos' not in data or is_entity_occurrence(
                                    all_pos, sentence, start + i,
                                    start + i + j):
                                s = ''.join(surface[i:i + j])
                                res = DBManager.get_candidate_entities(s, 0.1)
                                for e in res:
                                    if e[1] < 1.1 and (
                                            e[0] not in candidates
                                            or e[1] > candidates[e[0]]):
                                        candidates[e[0]] = e[1]
                            found = len(res) > 0
            # candidates = sorted(candidates.items(), key=lambda x:x[1], reverse=True)[:20]
            candidates = candidates.items()
            correct = False
            for e, _ in candidates:
                if e == gold_entity:
                    avg_candidate += len(candidates)
                    num += 1
                    correct = True
                    break
            fout.write("%s\t%s\n" % (gold_entity, ' '.join(
                c for c, _ in candidates)))
            if not correct:
                # print >> fout, line.strip(), candidates
                print(surfaces, data['gold'].split('\t'), gold_entity)
            # else:
            # print line.strip()
            # print candidates
    print "%s find correct topic entity" % num
    print "average number of candidate entities: %s" % (avg_candidate * 1.0 /
                                                        num)