def multi_process_generate(self):
    L = len(self.class_list)
    step = (L // self.process_num) + 1
    # Each worker gets a copy of its own contiguous slice of the class list,
    # together with its worker index.
    args = [(self.run_similarity,
             self.class_list[i * step:min((i + 1) * step, L)],
             i) for i in range(self.process_num)]
    multiprocess(self.iterate_f, args, self.process_num)
def multi_process_generate(self):
    L = len(self.class_list)
    step = (L // self.process_num) + 1
    # Each worker gets a copy of its own slice of the class list, plus the
    # shared random-feature dict and its worker index.
    args = [(self.run_examplar,
             self.class_list[i * step:min((i + 1) * step, L)],
             self.random_fea_dict,
             i) for i in range(self.process_num)]
    multiprocess(self.iterate_f, args, self.process_num)
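# The `multiprocess` and `split_data` helpers used throughout this repo are
# defined elsewhere. Below is a minimal sketch of the pool-map-concat pattern
# they appear to implement -- the name, signature and concat behavior are
# assumptions for illustration, not the repo's actual implementation. Note
# that on spawn-based platforms the mapped function must be picklable
# (module-level), so the inner `process` closures used below rely on
# fork-based multiprocessing.
from multiprocessing import Pool

import pandas as pd


def multiprocess_sketch(func, chunks, n_jobs=8):
    # Map `func` over the chunks in parallel; merge DataFrame results.
    with Pool(n_jobs) as pool:
        results = pool.map(func, chunks)
    if results and all(isinstance(r, pd.DataFrame) for r in results):
        return pd.concat(results, ignore_index=True)
    return results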
def parse_coauthor(file):
    """
    Parse & convert the coauthor file into a DataFrame.

    Args:
        file: coauthor file path, encoded in utf-8.
            coauthor data: https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Coauthor.zip

    ETA ~10min
    """
    with open(file, encoding='utf-8') as f:
        data = f.readlines()

    def process(d):
        # Each line looks like "#<1st>\t<2nd>\t<num>"; strip the leading '#'
        # and the newline, then split on tabs. Rows are collected in a list
        # first -- row-wise DataFrame.append() is quadratic and was removed
        # in pandas 2.0.
        columns = ['1st', '2nd', 'num']
        rows = [dict(zip(columns, c.lstrip('#').rstrip('\n').split('\t')))
                for c in d]
        return pd.DataFrame(rows, columns=columns)

    coauthor_df = multiprocess(process, split_data(data, size=2000))
    coauthor_df['num'] = coauthor_df['num'].astype('int64')
    return coauthor_df
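# A usage sketch for parse_coauthor(); the data path is hypothetical. Each
# line of AMiner-Coauthor.txt reads "#<1st-index>\t<2nd-index>\t<num>", i.e.
# two author indices and their number of collaborations.
def _example_parse_coauthor():
    coauthor_df = parse_coauthor('data/AMiner-Coauthor.txt')
    print(coauthor_df.head())  # columns: 1st | 2nd | num
    return coauthor_df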
def region_str(author_df, by='country'):
    df = author_df.dropna(subset=[by]).copy().reset_index(drop=True)
    df = df[['n_cites', by]]

    def h_index(df):
        # Sort citation counts in descending order; the h-index is the first
        # (0-based) position i whose citation count is <= i. The loop must
        # break at the first hit -- without the break it would always fall
        # through to the last position.
        processed_df = df.sort_values(
            by=['n_cites'], ascending=False).copy().reset_index(drop=True)
        index = processed_df.shape[0]
        for i in range(processed_df.shape[0]):
            if int(processed_df['n_cites'][i]) <= i:
                index = i
                break
        return pd.DataFrame({by: [df[by].values[0]], 'h_index': [index]})

    reg_str = multiprocess(h_index, split_data(df, by=by), n_jobs=12)
    return reg_str
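# A worked example of the h_index logic above (citation counts are
# illustrative): sorted descending -> [10, 6, 5, 3, 1]. The first 0-based
# position i with cites[i] <= i is i = 3 (3 <= 3), so h = 3: at least 3
# papers have 3 or more citations each, but not 4 papers with 4 or more.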
def parsetxt2df(file, prefix_lst, columns):
    """
    Parse & convert a txt file into a DataFrame.

    Args:
        file: txt file path, encoded in utf-8.
            paper data: https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Paper.rar
            author data: https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Author.zip
        prefix_lst: list of prefixes; a prefix indicates the data type of a line.
        columns: list of DataFrame column names, same length as prefix_lst.

    Returns:
        Converted DataFrame, needs further processing.

    ETA:
        ~2min for author data
        ~15min for paper data
    """
    with open(file, encoding='utf-8') as f:
        data = f.readlines()
    # Filter out every line without one of the required prefixes so that the
    # column lengths match.
    data = [d for d in data if any(d.startswith(p) for p in prefix_lst)]

    def process(d):
        def parse(prefix):
            if prefix == '#%':
                # '#%' lines are references; group them under the record
                # opened by the preceding '#index' line.
                processed_lst = list()
                for line in d:
                    if line.startswith('#index'):
                        processed_lst.append(list())
                    elif line.startswith('#%'):
                        processed_lst[-1].append(strprocess(line, '#%'))
                return [';'.join(lst) for lst in processed_lst]
            else:
                return [
                    strprocess(s, prefix) for s in d if s.startswith(prefix)
                ]

        return pd.DataFrame(
            {col: parse(prefix) for col, prefix in zip(columns, prefix_lst)})

    size = 100000
    if '#%' in prefix_lst:
        # Reference lines must stay grouped with their '#index' line, so the
        # data cannot be split into independent chunks here.
        return process(data)
    processed_df = multiprocess(process, split_data(data, size=size))
    return processed_df
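# A usage sketch for parsetxt2df() on the AMiner author file. The prefixes
# follow the AMiner-Author format ('#index' author id, '#n' name, '#a'
# affiliations, '#cn' citation count) -- verify them against the downloaded
# file before use; the path is hypothetical.
def _example_parsetxt2df():
    author_df = parsetxt2df(
        'data/AMiner-Author.txt',
        prefix_lst=['#index', '#n', '#a', '#cn'],
        columns=['author_id', 'name', 'affiliations', 'n_cites'])
    return author_df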
def cooperate_strength(coauthor_df, author_df, by='country'):
    """
    Calculate cooperation strength (number of collaborations).

    Args:
        coauthor_df: pd.DataFrame, coauthor data
        author_df: pd.DataFrame, author data, requires the column given by `by`
        by: string, either 'country', 'city' or 'affiliation';
            the dimension on which authors are aggregated

    Returns:
        pd.DataFrame with columns: 1st | 2nd | strength
    """
    assert by in ['country', 'city', 'affiliation']

    def get(author_id):
        # Map an author id to its country/city/affiliation; None if unknown.
        res = author_df[author_df['author_id'] == author_id][by].values
        if len(res) > 0:
            return res[0]

    def process(df):
        coop_str = pd.DataFrame()
        coop_str['1st'] = df['1st'].apply(get)
        coop_str['2nd'] = df['2nd'].apply(get)
        coop_str['str'] = df['num']
        coop_str = coop_str.dropna(subset=['1st', '2nd'])
        return coop_str

    coop_df = multiprocess(process, split_data(coauthor_df, size=1000))
    # Aggregate pairwise author links into region-level strengths.
    df1 = pd.DataFrame()
    groups = coop_df.groupby(by=['1st', '2nd'])
    strength = groups['str'].sum()
    df1['1st'] = [idx[0] for idx in strength.axes[0]]
    df1['2nd'] = [idx[1] for idx in strength.axes[0]]
    df1['strength'] = strength.values.astype('int')
    return df1
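# A usage sketch for cooperate_strength(); inputs are the frames produced by
# parse_coauthor and parsetxt2df above.
def _example_cooperate_strength(coauthor_df, author_df):
    coop = cooperate_strength(coauthor_df, author_df, by='country')
    # A row like ('China', 'United States', 1024) would mean authors from the
    # two countries collaborated 1024 times in total (value illustrative).
    return coop.sort_values('strength', ascending=False).head(20)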
def overview(paper_df, by='country'):
    """
    Data for the annual publication overview at a given scale.

    Args:
        paper_df: pd.DataFrame, paper data, requires columns id, year & affiliation
        by: string, 'country' or 'city';
            the dimension on which the annual publication count is calculated

    Returns:
        processed DataFrame, grouped by year & country/city
    """

    def process(df):
        df_ = df[['id', 'year', by]].copy()
        df_ = df_.dropna(subset=[by])
        grouped_paper = df_.groupby(["year", by])
        df1 = pd.DataFrame()
        count = grouped_paper["id"].apply(len)
        df1["year"] = [idx[0] for idx in count.axes[0]]
        df1[by] = [idx[1] for idx in count.axes[0]]
        df1["publication count"] = count.values.astype('int')
        df1['year'] = df1['year'].astype('int')
        return df1

    overview_df_ = multiprocess(process, split_data(paper_df, by='year'),
                                n_jobs=12)
    # Merge the per-chunk counts into the final (year, region) totals.
    grouped_overview_df = overview_df_.groupby(["year", by])
    overview_df = pd.DataFrame()
    count_sum = grouped_overview_df["publication count"].sum()
    overview_df["year"] = [idx[0] for idx in count_sum.axes[0]]
    overview_df[by] = [idx[1] for idx in count_sum.axes[0]]
    overview_df["publication count"] = count_sum.values.astype('int')
    overview_df['year'] = overview_df['year'].astype('int')
    return overview_df
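# A usage sketch for overview(); the country value is illustrative.
def _example_overview(processed_paper_df):
    # Annual publication counts per country, here for one country by year.
    ov = overview(processed_paper_df, by='country')
    return ov[ov['country'] == 'United States'].sort_values('year')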
def process(df, orgs_col):
    # Keep only the first affiliation, then derive city & country from it.
    df['affiliation'] = df[orgs_col].apply(lambda s: s.split(';')[0]
                                           if s else None)
    df['city'] = df['affiliation'].apply(aff2city)
    df['country'] = df['affiliation'].apply(aff2country)
    return df


if __name__ == '__main__':
    # dblp_paper_df = read_csv(DATA_PATH, 'dblp_paper.csv')
    # asn_paper_df = read_csv(DATA_PATH, 'asn_paper.csv')
    author_df = read_csv(DATA_PATH, 'author.csv')
    # coauthor_df = read_csv(DATA_PATH, 'coauthor.csv')

    # Processed DBLP paper DataFrame: drop rows without author org, then
    # compute city & country.
    # processed_paper_df = dblp_paper_df.dropna(subset=['authors_org']).copy().reset_index(drop=True)
    # processed_dblp_paper_df = multiprocess(lambda df: process(df, orgs_col='authors_org'),
    #                                        split_data(processed_paper_df, size=1000), n_jobs=12)
    # processed_dblp_paper_df.to_csv(os.path.join(DATA_PATH, 'processed_dblp_paper.csv'), index=False)

    # Processed author DataFrame: drop rows without affiliations, then
    # compute city & country.
    processed_author_df_ = author_df.dropna(
        subset=['affiliations']).copy().reset_index(drop=True)
    processed_author_df = multiprocess(
        lambda df: process(df, orgs_col='affiliations'),
        split_data(processed_author_df_, size=1000), n_jobs=12)
    processed_author_df.to_csv(os.path.join(DATA_PATH,
                                            'processed_author.csv'),
                               index=False)
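# `aff2city` and `aff2country` used in process() above are defined elsewhere
# in the repo. A minimal sketch of what such a mapping could look like,
# assuming a simple lowercase substring lookup against a hand-made table;
# the name, table and entries below are illustrative assumptions, not the
# repo's actual implementation.
CITY_TABLE_SKETCH = {'tsinghua': 'Beijing', 'mit': 'Cambridge'}


def aff2city_sketch(affiliation):
    # Return the first city whose keyword appears in the affiliation string.
    if not affiliation:
        return None
    aff = affiliation.lower()
    for keyword, city in CITY_TABLE_SKETCH.items():
        if keyword in aff:
            return city
    return None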
def parsejson2df(file, col_lst=JSON_KEYWORDS):
    """
    Parse & load a json file into a DataFrame.
    Data at https://originalstatic.aminer.cn/misc/dblp.v12.7z

    NOTE: this file is pretty large (~12GB unzipped). It is read line by
    line, but the parsed data is collected into a single DataFrame, which
    may require a lot of memory.

    ETA ~30min
    """

    def parse(js):
        processed_line = dict()
        # Only keywords in col_lst are considered; if more keywords are
        # required, corresponding read-in code should be added.
        for col in set(col_lst).intersection(set(js.keys())):
            if col == 'id':
                processed_line['id'] = js['id']
            elif col == 'authors':
                authors = js['authors']
                processed_line['authors_id'] = ';'.join([
                    str(author.get('id', None)) for author in authors
                    if author.get('id', None)
                ])
                processed_line['authors_name'] = ';'.join([
                    author.get('name', None) for author in authors
                    if author.get('name', None)
                ])
                processed_line['authors_org'] = ';'.join([
                    str(author.get('org', None)) for author in authors
                    if author.get('org', None)
                ])
            elif col == 'venue':
                venue = js['venue']
                processed_line['venue_id'] = venue.get('id', None)
                processed_line['venue_name'] = venue.get('raw', None)
            elif col == 'year':
                processed_line['year'] = js['year']
            elif col == 'keywords':
                processed_line['keywords'] = ';'.join(js['keywords'])
            elif col == 'references':
                processed_line['references'] = ';'.join(
                    [str(r) for r in js['references']])
            elif col == 'n_citation':
                processed_line['n_cites'] = js['n_citation']
            elif col == 'doc_type':
                processed_line['doc_type'] = js['doc_type']
            elif col == 'fos':
                fos = js['fos']
                processed_line['fos_name'] = ';'.join([
                    str(f.get('name', None)) for f in fos
                    if f.get('name', None)
                ])
                processed_line['fos_weight'] = ';'.join(
                    [str(f.get('w', None)) for f in fos if f.get('w', None)])
        return processed_line

    with open(file, encoding='utf-8') as f:
        lines = f.readlines()

    def process(ls):
        rows = []
        for line in ls:
            # Each line holds one JSON object, possibly prefixed with the
            # array separator ','; strip enclosing array characters too.
            line = line.rstrip('\n').lstrip('[').lstrip(',').rstrip(']')
            rows.append(parse(json.loads(line)))
        # Building the frame once avoids the quadratic (and, in pandas 2.0,
        # removed) row-wise DataFrame.append().
        return pd.DataFrame(rows)

    # Skip the enclosing '[' and ']' lines of the JSON array.
    parsed_df = multiprocess(process, split_data(lines[1:-1], size=2000))
    # Change data types
    parsed_df['id'] = parsed_df['id'].astype('int64')
    parsed_df['year'] = parsed_df['year'].astype('int')
    parsed_df['n_cites'] = parsed_df['n_cites'].astype('int')
    return parsed_df
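# A usage sketch for parsejson2df(); the paths are hypothetical. The result
# can be cached to csv and fed to the downstream processing steps above.
def _example_parsejson2df():
    paper_df = parsejson2df('data/dblp.v12.json')
    paper_df.to_csv('data/dblp_paper.csv', index=False)
    return paper_df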