Example #1
def multi_process_generate(self):
    # Split the class list into roughly equal chunks, one per process.
    L = len(self.class_list)
    step = (L // self.process_num) + 1
    args = [(self.run_similarity,
             self.class_list[i * step:min((i + 1) * step, L)], i)
            for i in range(self.process_num)]  # slicing copies each chunk
    multiprocess(self.iterate_f, args, self.process_num)
Example #2
def multi_process_generate(self):
    # Same chunking as Example #1, but each worker also receives the
    # shared random-feature dict.
    L = len(self.class_list)
    step = (L // self.process_num) + 1
    args = [(self.run_examplar,
             self.class_list[i * step:min((i + 1) * step, L)],
             self.random_fea_dict, i)
            for i in range(self.process_num)]  # slicing copies each chunk
    multiprocess(self.iterate_f, args, self.process_num)
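Both methods above, and every example below, depend on multiprocess and split_data helpers that are not shown. A minimal sketch of what they plausibly look like, under the assumption that multiprocess maps a function over a list of work items in a process pool and concatenates DataFrame results, and that split_data chunks its input either by fixed size or by a grouping column (names, defaults, and bodies here are guesses, not the original implementation):

import multiprocessing as mp
import pandas as pd

def multiprocess(func, data_list, n_jobs=8):
    # Map func over the work items in parallel.
    with mp.Pool(n_jobs) as pool:
        results = pool.map(func, data_list)
    # When every worker returns a DataFrame, stitch the pieces back together.
    if results and all(isinstance(r, pd.DataFrame) for r in results):
        return pd.concat(results, ignore_index=True)
    return results

def split_data(data, size=None, by=None):
    # Either one chunk per value of a grouping column, or fixed-size slices.
    if by is not None:
        return [g.reset_index(drop=True) for _, g in data.groupby(by)]
    return [data[i:i + size] for i in range(0, len(data), size)]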
Example #3
def parse_coauthor(file):
    """ Parse the coauthor file and convert it into a DataFrame.

    Args:
        -- file: coauthor file path, encoded in utf-8.
            coauthor file: https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Coauthor.zip

    ETA 10min
    """
    with open(file, encoding='utf-8') as f:
        data = f.readlines()

    def process(d):
        # Collect the rows first and build the DataFrame once;
        # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
        rows = []
        for c in d:
            c = c.lstrip('#').rstrip('\n')
            rows.append(dict(zip(['1st', '2nd', 'num'], c.split('\t'))))
        return pd.DataFrame(rows, columns=['1st', '2nd', 'num'])

    coauthor_df = multiprocess(process, split_data(data, size=2000))
    coauthor_df['num'] = coauthor_df['num'].astype('int64')
    return coauthor_df
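As a sanity check of the line format the parser assumes (a '#'-prefixed, tab-separated triple; the concrete values below are made up):

row = '#123\t456\t3\n'
fields = row.lstrip('#').rstrip('\n').split('\t')
print(dict(zip(['1st', '2nd', 'num'], fields)))
# -> {'1st': '123', '2nd': '456', 'num': '3'}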
Example #4
def region_str(author_df, by='country'):
    # Compute one h-index per region in the `by` column.
    df = author_df.dropna(subset=[by]).copy().reset_index(drop=True)
    df = df[['n_cites', by]]

    def h_index(df):
        # Sort citation counts in descending order; the h-index is the first
        # position i whose paper has no more than i citations.
        processed_df = df.sort_values(
            by=['n_cites'], ascending=False).copy().reset_index(drop=True)
        index = processed_df.shape[0]
        for i in range(processed_df.shape[0]):
            if int(processed_df['n_cites'][i]) <= i:
                index = i
                break  # without this break, index drifts to the last position
        return pd.DataFrame({by: [df[by].values[0]], 'h_index': [index]})

    reg_str = multiprocess(h_index, split_data(df, by=by), n_jobs=12)
    return reg_str
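A toy trace of the h-index loop with the early break added above; the counts are invented:

cites = [10, 8, 5, 4, 3]  # citation counts, sorted descending
h = len(cites)
for i, c in enumerate(cites):
    if c <= i:
        h = i
        break
print(h)  # 4: four papers have at least 4 citations each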
Example #5
def parsetxt2df(file, prefix_lst, columns):
    """ Parse & convert txt into Dataframe.

    Args:
        -- file: txt file address, encoding in utf-8.
            paper data: https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Paper.rar
            author data: https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Author.zip
        -- prefix_lst: prefix list, prefix indicates data type of lines.
        -- columns: column list, Dataframe columns, in same shape as prefix_lst.

    Return:
        Converted Dataframe, needs further process.

    ETA
        - 2min for author data
        - 15min for paper data
    """
    with open(file, encoding='utf-8') as f:
        data = f.readlines()
    # Filter out every row that carries none of the required prefixes,
    # so that all column lengths can match.
    data = [d for d in data if any(d.startswith(p) for p in prefix_lst)]

    def process(d):
        def parse(prefix):
            if prefix == '#%':
                # Reference lines ('#%') repeat per record; group them under
                # the '#index' line that opens each record.
                processed_lst = list()
                for line in d:
                    if line.startswith('#index'):
                        processed_lst.append(list())
                    elif line.startswith('#%'):
                        processed_lst[-1].append(strprocess(line, '#%'))

                return [';'.join(lst) for lst in processed_lst]
            else:
                return [
                    strprocess(s, prefix) for s in d if s.startswith(prefix)
                ]

        return pd.DataFrame(
            {col: parse(prefix)
             for col, prefix in zip(columns, prefix_lst)})

    size = 100000
    if '#%' in prefix_lst:
        # Reference grouping needs whole records, so run single-process.
        return process(data)
    processed_df = multiprocess(process, split_data(data, size=size))
    return processed_df
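A hypothetical call for the author file. The prefix list is an assumption based on the AMiner text format ('#index' for the id, '#n' for the name, '#a' for affiliations); adjust it to match the actual dump:

author_df = parsetxt2df('AMiner-Author.txt',
                        prefix_lst=['#index', '#n', '#a'],
                        columns=['author_id', 'name', 'affiliations'])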
Example #6
def cooperate_strength(coauthor_df, author_df, by='country'):
    """ Calculate cooperate strength (times)

    Args:
        - coauthor_df: pd.Dataframe
            coauthor data
        - coauthor_df: pd.Dataframe
            author data, requires column affiliation
        - by: string, either 'country', 'city' or 'affiliation'
            dimension where authors aggregates
    Returns:
        pd.Dataframe
        1st | 2nd | str
        _______________
            |     |
    """
    assert by in ['country', 'city', 'affiliation']

    def get(author_id):
        # Look up the region of a single author; returns None when unknown.
        res = author_df[author_df['author_id'] == author_id][by].values
        if len(res) > 0:
            return res[0]

    def process(df):
        coop_str = pd.DataFrame()
        coop_str['1st'] = df['1st'].apply(get)
        coop_str['2nd'] = df['2nd'].apply(get)
        coop_str['str'] = df['num']
        coop_str = coop_str.dropna(subset=['1st', '2nd'])
        return coop_str

    coop_df = multiprocess(process, split_data(coauthor_df, size=1000))

    # Aggregate the pairwise strengths over (1st, 2nd) region pairs.
    df1 = pd.DataFrame()
    groups = coop_df.groupby(by=['1st', '2nd'])
    strength = groups['str'].apply(sum)
    df1['1st'] = [idx[0] for idx in strength.axes[0]]
    df1['2nd'] = [idx[1] for idx in strength.axes[0]]
    df1['strength'] = strength.values.astype('int')
    return df1
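The manual unpacking of the group index above can be written more compactly with a standard pandas idiom; a sketch producing the same three columns:

df1 = (coop_df.groupby(['1st', '2nd'])['str']
              .sum()
              .reset_index(name='strength'))
df1['strength'] = df1['strength'].astype('int')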
Example #7
def overview(paper_df, by='country'):
    """ Data for the annual publication overview on the certain scale.

    Args:
        - paper_df: pd.Dataframe
            paper data, requires column of id, year & affiliation
        - by: string, 'country' or 'city'
            dimension where we calculate the annual publication num

    Returns:
        processed Dataframe, grouped by year & country/city

    """
    def process(df):
        df_ = df[['id', 'year', by]].copy()
        df_ = df_.dropna(subset=[by])
        grouped_paper = df_.groupby(["year", by])

        df1 = pd.DataFrame()
        count = grouped_paper["id"].apply(len)
        df1["year"] = [idx[0] for idx in count.axes[0]]
        df1[by] = [idx[1] for idx in count.axes[0]]
        df1["publication count"] = count.values.astype('int')

        df1['year'] = df1['year'].astype('int')
        return df1

    overview_df_ = multiprocess(process,
                                split_data(paper_df, by='year'),
                                n_jobs=12)
    grouped_overview_df = overview_df_.groupby(["year", by])

    overview_df = pd.DataFrame()
    count_sum = grouped_overview_df["publication count"].apply(sum)
    overview_df["year"] = [idx[0] for idx in count_sum.axes[0]]
    overview_df[by] = [idx[1] for idx in count_sum.axes[0]]
    overview_df["publication count"] = count_sum.values.astype('int')

    overview_df['year'] = overview_df['year'].astype('int')
    return overview_df
Example #8
def process(df, orgs_col):
    df['affiliation'] = df[orgs_col].apply(lambda s: s.split(';')[0]
                                           if s else None)
    df['city'] = df['affiliation'].apply(aff2city)
    df['country'] = df['affiliation'].apply(aff2country)
    return df


if __name__ == '__main__':
    # dblp_paper_df = read_csv(DATA_PATH, 'dblp_paper.csv')
    # asn_paper_df = read_csv(DATA_PATH, 'asn_paper.csv')
    author_df = read_csv(DATA_PATH, 'author.csv')
    # coauthor_df = read_csv(DATA_PATH, 'coauthor.csv')

    # Process the DBLP paper DataFrame: drop rows without author org,
    # derive city & country.
    # processed_paper_df = dblp_paper_df.dropna(subset=['authors_org']).copy().reset_index(drop=True)
    # processed_dblp_paper_df = multiprocess(lambda df: process(df, orgs_col='authors_org'),
    #                                        split_data(processed_paper_df, size=1000), n_jobs=12)
    # processed_dblp_paper_df.to_csv(os.path.join(DATA_PATH, 'processed_dblp_paper.csv'), index=False)

    # Process the author DataFrame: drop rows without affiliations,
    # derive city & country.
    processed_author_df_ = author_df.dropna(
        subset=['affiliations']).copy().reset_index(drop=True)
    processed_author_df = multiprocess(
        lambda df: process(df, orgs_col='affiliations'),
        split_data(processed_author_df_, size=1000),
        n_jobs=12)
    processed_author_df.to_csv(os.path.join(DATA_PATH, 'processed_author.csv'),
                               index=False)
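The aff2city and aff2country helpers are not shown anywhere in these examples. Purely hypothetical stand-ins, assuming a keyword-table lookup, just to make the snippet testable; the real helpers presumably consult a proper geocoding source:

CITY_TABLE = {'Stanford': 'Stanford', 'Tsinghua': 'Beijing'}        # assumed
COUNTRY_TABLE = {'Stanford': 'United States', 'Tsinghua': 'China'}  # assumed

def aff2city(aff):
    # First city whose keyword occurs in the affiliation string, else None.
    return next((c for k, c in CITY_TABLE.items() if k in aff), None)

def aff2country(aff):
    return next((c for k, c in COUNTRY_TABLE.items() if k in aff), None)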
Example #9
def parsejson2df(file, col_lst=JSON_KEYWORDS):
    """ Parse & load json file to Dataframe.

    Data at https://originalstatic.aminer.cn/misc/dblp.v12.7z
    ** THIS FILE IS PRETTY LARGE (12GB when unzipped), the file is loaded by readline,
    HOWEVER THE JSON DATA IS CONVERTED TO A SINGLE DATAFRAME, MAY REQUIRE LARGE MEMORY.

    ETA 30min
    """
    def parse(js):
        processed_line = dict()
        for col in set(col_lst).intersection(set(js.keys())):
            # Only keywords in the list above are considered.
            # If more keywords are required, the corresponding parsing code
            # must be added below.
            if col == 'id':
                processed_line['id'] = js['id']
            elif col == 'authors':
                authors = js['authors']
                processed_line['authors_id'] = ';'.join([
                    str(author.get('id', None)) for author in authors
                    if author.get('id', None)
                ])
                processed_line['authors_name'] = ';'.join([
                    author.get('name', None) for author in authors
                    if author.get('name', None)
                ])
                processed_line['authors_org'] = ';'.join([
                    str(author.get('org', None)) for author in authors
                    if author.get('org', None)
                ])
            elif col == 'venue':
                venue = js['venue']
                processed_line['venue_id'] = venue.get('id', None)
                processed_line['venue_name'] = venue.get('raw', None)
            elif col == 'year':
                processed_line['year'] = js['year']
            elif col == 'keywords':
                processed_line['keywords'] = ';'.join(js['keywords'])
            elif col == 'references':
                processed_line['references'] = ';'.join(
                    [str(r) for r in js['references']])
            elif col == 'n_citation':
                processed_line['n_cites'] = js['n_citation']
            elif col == 'doc_type':
                processed_line['doc_type'] = js['doc_type']
            elif col == 'fos':
                fos = js['fos']
                processed_line['fos_name'] = ';'.join([
                    str(f.get('name', None)) for f in fos
                    if f.get('name', None)
                ])
                processed_line['fos_weight'] = ';'.join(
                    [str(f.get('w', None)) for f in fos if f.get('w', None)])
        return processed_line

    with open(file, encoding='utf-8') as f:
        lines = f.readlines()

    def process(ls):
        # Accumulate parsed dicts and build the DataFrame once
        # (DataFrame.append was removed in pandas 2.0).
        records = []
        for line in ls:
            # Each line is one object of a huge JSON array; strip the
            # surrounding bracket/comma characters before decoding.
            line = line.lstrip('[').lstrip(',').rstrip(']').rstrip('\n')
            records.append(parse(json.loads(line)))
        return pd.DataFrame(records)

    parsed_df = multiprocess(process, split_data(lines[1:-1], size=2000))

    # Change data type
    parsed_df['id'] = parsed_df['id'].astype('int64')
    parsed_df['year'] = parsed_df['year'].astype('int')
    parsed_df['n_cites'] = parsed_df['n_cites'].astype('int')

    return parsed_df
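To illustrate the record shape process() consumes and the flat dict parse() emits, a toy line with invented values (assuming JSON_KEYWORDS covers these keys):

import json

line = ('{"id": 1, "year": 2019, "n_citation": 7, '
        '"authors": [{"id": 42, "name": "A. Author", "org": "X Univ"}]}')
js = json.loads(line)
# parse(js) would return:
# {'id': 1, 'year': 2019, 'n_cites': 7,
#  'authors_id': '42', 'authors_name': 'A. Author', 'authors_org': 'X Univ'}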