def on_process(self, data_frames):
        """Compute, per (page_id, user_name), the ESA semantic similarity
        between the page's summary (first 300 chars of its text) and the
        concatenated content tokens the user contributed to it.

        Returns a DataFrame with columns
        ['page_id', 'user_name', 'contribution_similarity'].
        """
        rows = []
        out_columns = ['page_id', 'user_name', 'contribution_similarity']

        esa = EsaProvider(self.data_dir)

        revs_df = data_frames[0]
        assert isinstance(revs_df, pd.DataFrame)

        with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                                 port=config.get_int('MONGO',
                                                     'port')) as client:
            pages_collection = client.get_database(
                config.get('MONGO', 'database')).get_collection('pages')
            persistence_collection = client.get_database(
                config.get('MONGO', 'persistence_database')).get_collection(
                    config.get('MONGO', 'persistence_collection'))

            for (page_id, user_name), group in revs_df.groupby(
                    by=['page_id', 'user_name']):
                if is_bot(user_name):
                    continue

                # Gather the persistence records of this user's revisions
                # on this page (non-RevisionPersistence results are dropped).
                persists = []
                for _, row in group.iterrows():
                    rev_pers = self.get_persistence(
                        rev_id=row['revision_id'],
                        collection=persistence_collection)
                    if isinstance(rev_pers, RevisionPersistence):
                        persists.append(rev_pers)

                if not persists:
                    contribution_similarity = 0.0
                else:
                    all_content_tokens = []
                    for rev_pers in persists:
                        all_content_tokens.extend(rev_pers.content_tokens)

                    if not all_content_tokens:
                        contribution_similarity = 0.0
                    else:
                        page_json = pages_collection.find_one(
                            filter={'_id': int(page_id)})
                        if page_json is None:
                            # Page not in Mongo: emit no row for this pair.
                            continue

                        summary = str(page_json['text'][:300])
                        contribution_similarity = esa.get_semantic_similarity(
                            summary, ' '.join(all_content_tokens))

                rows.append([page_id, user_name, contribution_similarity])

        return pd.DataFrame(data=rows, columns=out_columns)
    def get_revisions(self, site, page_id, talk_page_id, bots):
        """Fetch revisions of *talk_page_id* from the MediaWiki API in
        batches, following the `rvcontinue` token until the API reports
        no more data or `limit` revisions have been seen.

        Revisions whose user info is suppressed/hidden and revisions made
        by bots are skipped.

        Returns a DataFrame with columns ['page_id', 'talk_page_id',
        'revision_id', 'user_id', 'user_name', 'size', 'timestamp'].
        """
        data = []
        limit = 50000
        rv_continue = None
        total_revisions = 0

        while True:
            json_data = self.__get_revisions(site, talk_page_id, rv_continue)
            pages = json_data['query']['pages']
            cont_dictionary = json_data.get('continue')

            # BUG FIX: the old code did `continue` when a batch had no
            # 'revisions' key, which skipped the continuation bookkeeping
            # below and re-requested the identical batch forever (infinite
            # loop).  Only process revisions when present; always fall
            # through to the continuation check.
            page = pages.get(str(talk_page_id))
            if page is not None and 'revisions' in page:
                for rev in page['revisions']:
                    total_revisions += 1
                    # Hidden/suppressed revisions cannot be attributed.
                    if 'suppressed' in rev or 'userhidden' in rev:
                        continue

                    user_id = parse_string(rev['userid'])
                    user_name = parse_string(rev['user'])
                    size = parse_int(rev['size'])
                    rev_id = parse_int(rev['revid'])
                    timestamp = (parse_timestamp(rev['timestamp'])
                                 if 'timestamp' in rev else None)

                    if is_bot(user_name):
                        continue

                    data.append([
                        page_id, talk_page_id, rev_id, user_id, user_name,
                        size, timestamp
                    ])

            if cont_dictionary and total_revisions < limit:
                rv_continue = str(cont_dictionary['rvcontinue'])
            else:
                break

        return pd.DataFrame(data=data,
                            columns=[
                                'page_id', 'talk_page_id', 'revision_id',
                                'user_id', 'user_name', 'size', 'timestamp'
                            ])
# Example #3
    def on_process(self, data_frames):
        """Compute, per (page_id, user_name), the mean Jaccard overlap
        between the page's outgoing links and the links of the user's
        top-10 other pages.

        Returns a DataFrame with columns
        ['page_id', 'user_name', 'links_overlap'].
        """
        revs_df = data_frames[0]
        assert isinstance(revs_df, pd.DataFrame)
        top10_pages_df = data_frames[1]
        assert isinstance(top10_pages_df, pd.DataFrame)

        data = []
        columns = ['page_id', 'user_name', 'links_overlap']
        with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                                 port=config.get_int('MONGO',
                                                     'port')) as client:
            db = client.get_database(config.get('MONGO', 'database'))
            collection = db.get_collection('page_links')

            grouped = revs_df.groupby(by=['page_id', 'user_name'])
            for (page_id, user_name), group in grouped:
                if is_bot(user_name):
                    continue

                # NOTE: pymongo's find() never returns None, so the old
                # `if cursor is None: continue` guards were dead code.
                cursor = collection.find(filter={'pl_from': int(page_id)})
                links = [entry['pl_title'] for entry in cursor]
                link_set = set(links)

                # The user's other top-10 pages, excluding the current one.
                top10_page_ids = [
                    pid for pid in top10_pages_df[
                        top10_pages_df['user_name'] == user_name]['page_id']
                    if pid != page_id
                ]

                links_overlaps = []
                for k_page_id in top10_page_ids:
                    k_cursor = collection.find(
                        filter={'pl_from': int(k_page_id)})
                    k_links = [entry['pl_title'] for entry in k_cursor]

                    # BUG FIX: the old guard was mis-parenthesised as
                    # `(len(links) > 0 or len(k_links)) > 0`, comparing a
                    # boolean against 0; it only produced the intended
                    # result by accident.  Guard on a non-empty union,
                    # which is exactly the divide-by-zero condition.
                    union = link_set.union(k_links)
                    if union:
                        links_overlap = (
                            float(len(link_set.intersection(k_links))) /
                            len(union))
                    else:
                        links_overlap = 0.0
                    links_overlaps.append(links_overlap)

                mean_links_overlap = (np.mean(links_overlaps)
                                      if links_overlaps else 0.0)
                data.append([page_id, user_name, mean_links_overlap])

        return pd.DataFrame(data=data, columns=columns)
# Example #4
    def on_process(self, data_frames):
        """Attach a normalized account-tenure feature to every
        (page_id, user_name) pair.

        Tenure = (last_edit - first_edit) in seconds, divided by the time
        from Wikipedia's launch (2001-01-15 UTC) to today (midnight UTC).
        """
        launch_date = pytz.utc.localize(datetime(year=2001, month=1, day=15))
        today = pytz.utc.localize(datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0))
        norm_seconds = (today - launch_date).total_seconds()

        revs_df = data_frames[0]

        per_user = []
        columns = ['user_name', 'tenure']
        if isinstance(revs_df, pd.DataFrame):
            with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                                     port=config.get_int('MONGO',
                                                         'port')) as client:
                db = client.get_database(config.get('MONGO', 'database'))
                collection = db.get_collection(
                    config.get('MONGO', 'collection'))

                for user_name in revs_df['user_name'].unique():
                    if is_bot(user_name):
                        continue

                    first_edit, last_edit = self.aggregate(
                        collection=collection, user_name=user_name)
                    if first_edit is None or last_edit is None:
                        continue

                    tenure = ((last_edit - first_edit).total_seconds() /
                              norm_seconds)
                    per_user.append([user_name, tenure])
                    logging.debug('Username: {}\tTenure: {}'.format(
                        user_name, tenure))

        tenure_df = pd.DataFrame(data=per_user, columns=columns)
        cols = ['page_id', 'user_name', 'tenure']
        merged = revs_df.merge(tenure_df, how='left', on='user_name')[cols]

        rows = []
        for (page_id, user_name), group in merged.groupby(
                by=['page_id', 'user_name']):
            rows.append([page_id, user_name, group.iloc[0]['tenure']])

        return pd.DataFrame(data=rows, columns=cols)
# Example #5
    def on_process(self, data_frames):
        """Attach the total number of pages each user has edited to every
        (page_id, user_name) pair.

        Returns a DataFrame with columns
        ['page_id', 'user_name', 'total_edited_pages'].
        """
        host = config.get('MONGO', 'host')
        port = config.get_int('MONGO', 'port')
        # Distinct names avoid shadowing the collection *object* below
        # with the configured collection *name*.
        database_name = config.get('MONGO', 'database')
        collection_name = config.get('MONGO', 'collection')

        revs_df = data_frames[0]

        per_user = []
        columns = ['user_name', 'total_edited_pages']
        if isinstance(revs_df, pd.DataFrame):
            with pymongo.MongoClient(host=host, port=port) as client:
                collection = client.get_database(
                    database_name).get_collection(collection_name)

                for user_name in revs_df['user_name'].unique():
                    if is_bot(user_name):
                        continue

                    total_edited_pages = self.aggregate(
                        collection=collection, user_name=user_name)
                    if total_edited_pages is None:
                        continue

                    per_user.append([user_name, total_edited_pages])
                    logging.debug(
                        'Username: {}\tTotal edited pages: {}'.format(
                            user_name, total_edited_pages))

        counts_df = pd.DataFrame(data=per_user, columns=columns)

        cols = ['page_id', 'user_name', 'total_edited_pages']
        merged = revs_df.merge(counts_df, how='left', on='user_name')[cols]

        rows = []
        for (page_id, user_name), group in merged.groupby(
                by=['page_id', 'user_name']):
            rows.append(
                [page_id, user_name, group.iloc[0]['total_edited_pages']])

        return pd.DataFrame(data=rows, columns=cols)
# Example #6
    def on_process(self, data_frames):
        """Attach a numeric gender feature to every (page_id, user_name)
        pair: 1.0 = male, 0.0 = any other reported gender, -1.0 = missing
        or unknown."""
        revs_df = data_frames[0]

        records = []
        columns = ['user_name', 'gender']
        if isinstance(revs_df, pd.DataFrame):
            user_names = [
                name for name in revs_df['user_name'].unique()
                if not is_bot(name)
            ]

            for user in user_names:
                resp_users = site.users(users=[str(user)], prop=['gender'])
                for resp_user in resp_users:
                    if 'missing' in resp_user or 'gender' not in resp_user:
                        gender = -1.
                    elif resp_user['gender'] == 'male':
                        gender = 1.
                    elif resp_user['gender'] is None:
                        gender = -1.
                    else:
                        gender = 0.
                    records.append([user, gender])
                    logging.debug('Username: {}\tGender: {}'.format(
                        user, gender))
                    break  # only the first API entry per user is used

        gender_df = pd.DataFrame(data=records, columns=columns)
        cols = ['page_id', 'user_name', 'gender']
        merged = revs_df.merge(gender_df, how='left', on='user_name')[cols]

        rows = []
        for (page_id, user_name), group in merged.groupby(
                by=['page_id', 'user_name']):
            rows.append([page_id, user_name, group.iloc[0]['gender']])

        return pd.DataFrame(data=rows, columns=cols)
# Example #7
    def run(self):
        """Build a (page_id, user_name) table for all non-bot users found
        in the input revisions file and persist it as an HDF file."""
        revs_df = pd.read_hdf(self.input()[0].path, mode='r')
        assert isinstance(revs_df, pd.DataFrame)
        user_names = [
            name for name in revs_df['user_name'].unique()
            if not is_bot(name)
        ]

        with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                                 port=config.get_int('MONGO',
                                                     'port')) as client:
            db = client.get_database(config.get('MONGO', 'database'))
            collection = db.get_collection(config.get('MONGO', 'collection'))

            rows = []
            for user_name in user_names:
                for page_id in self.aggregate(collection, user_name):
                    rows.append([page_id, user_name])

            pages_df = pd.DataFrame(data=rows,
                                    columns=['page_id', 'user_name'])
            pages_df.to_hdf(os.path.join(self.data_dir, self.file_name),
                            key='df',
                            mode='w')
# Example #8
    def on_process(self, data_frames):
        """Attach per-namespace edit-distribution features (ns0..ns15) to
        every (page_id, user_name) pair."""
        revs_df = data_frames[0]

        per_user = []
        ns_columns = ['ns{}_edit_dist'.format(ns) for ns in range(16)]
        columns = ['user_name'] + ns_columns
        if isinstance(revs_df, pd.DataFrame):
            with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                                     port=config.get_int('MONGO',
                                                         'port')) as client:
                db = client.get_database(config.get('MONGO', 'database'))
                collection = db.get_collection(
                    config.get('MONGO', 'collection'))

                for user_name in revs_df['user_name'].unique():
                    if is_bot(user_name):
                        continue

                    namespaces = self.aggregate(collection=collection,
                                                user_name=user_name)
                    per_user.append([user_name] + namespaces)
                    logging.debug('Username: {}\tNamespaces: {}'.format(
                        user_name, namespaces))

        ns_df = pd.DataFrame(data=per_user, columns=columns)
        cols = ['page_id', 'user_name'] + ns_columns
        merged = revs_df.merge(ns_df, how='left', on='user_name')[cols]

        rows = []
        for (page_id, user_name), group in merged.groupby(
                by=['page_id', 'user_name']):
            rows.append([page_id, user_name] + list(group.iloc[0][ns_columns]))

        return pd.DataFrame(data=rows, columns=cols)
    def on_process(self, data_frames):
        """Compute, per (page_id, user_name), the mean ESA similarity of
        the page's title and summary (first 1000 chars of text) against
        the user's top-10 other pages.

        Returns a DataFrame with columns ['page_id', 'user_name',
        'title_similarity', 'summary_similarity'].
        """
        rows = []
        out_columns = [
            'page_id', 'user_name', 'title_similarity', 'summary_similarity'
        ]

        esa = EsaProvider(self.data_dir)

        revs_df = data_frames[0]
        assert isinstance(revs_df, pd.DataFrame)
        top10_pages_df = data_frames[1]
        assert isinstance(top10_pages_df, pd.DataFrame)

        with pymongo.MongoClient(host=config.get('MONGO', 'host'),
                                 port=config.get_int('MONGO',
                                                     'port')) as client:
            db = client.get_database(config.get('MONGO', 'database'))
            collection = db.get_collection('pages')

            for (page_id, user_name), group in revs_df.groupby(
                    by=['page_id', 'user_name']):
                if is_bot(user_name):
                    continue

                # The user's other top-10 pages, excluding the current one.
                top10_page_ids = [
                    pid for pid in top10_pages_df[
                        top10_pages_df['user_name'] == user_name]['page_id']
                    if pid != page_id
                ]

                page_json = collection.find_one(filter={'_id': int(page_id)})
                if page_json is None:
                    continue

                title = str(page_json['title'])
                text = str(page_json['text'][:1000])

                title_scores = []
                summary_scores = []
                for k_page_id in top10_page_ids:
                    k_page_json = collection.find_one(
                        filter={'_id': int(k_page_id)})
                    if k_page_json is None:
                        continue

                    if 'title' in k_page_json:
                        title_sim = esa.get_semantic_similarity(
                            title, str(k_page_json['title']))
                        # Treat a None score as zero similarity.
                        title_scores.append(
                            0.0 if title_sim is None else title_sim)

                    if 'text' in k_page_json:
                        text_sim = esa.get_semantic_similarity(
                            text,
                            str(k_page_json['text'][:1000]),
                            long_text=True)
                        summary_scores.append(
                            0.0 if text_sim is None else text_sim)

                f_title_similarity = (np.mean(title_scores)
                                      if title_scores else 0.0)
                f_summary_similarity = (np.mean(summary_scores)
                                        if summary_scores else 0.0)

                rows.append([
                    page_id, user_name, f_title_similarity,
                    f_summary_similarity
                ])

        return pd.DataFrame(data=rows, columns=out_columns)