def main():
    """
    Full pipeline entry point: convert input PDFs to text, locate paragraphs
    that mention the configured term sets, associate each paper with its
    sets, and persist the result via ``mg.update_data()``.

    Requires ``WEIGHTED_SETS`` and ``UNWEIGHTED_SETS`` to be defined in
    ``manager.py``; prints an instruction and returns early otherwise.
    """
    # BUG FIX: the original test used `and`, so execution continued whenever
    # only ONE of the two attributes was missing and then crashed on the
    # attribute accesses below.  Both are required, so bail if EITHER is
    # absent (hasattr is also the idiomatic form of `'x' in dir(obj)`).
    if not hasattr(mg, 'WEIGHTED_SETS') or not hasattr(mg, 'UNWEIGHTED_SETS'):
        print(
            'Define WEIGHTED_SETS and UNWEIGHTED_SETS in manager.py before using this module'
        )
        return
    TXT_DIR = mg.dir(os.path.join(mg.WORKING_PATH, 'txts'))
    PDF_DIR = mg.dir(os.path.join(mg.INPUT_PATH, 'pdfs'))
    data = mg.get_data()
    # Extract plain text from every PDF under PDF_DIR into TXT_DIR.
    convertToText.walkAndText(PDF_DIR, TXT_DIR)
    # Index paragraphs mentioning any term from either set list.
    find_paragraphs(TXT_DIR, mg.WEIGHTED_SETS + mg.UNWEIGHTED_SETS,
                    outfile=os.path.join(mg.WORKING_PATH, 'paragraphs'))
    # Map each paper to its associated sets (weighted sets dominate).
    sets = assoc_sets(data, TXT_DIR, mg.WEIGHTED_SETS,
                      less_weighted_sets=mg.UNWEIGHTED_SETS)
    data['Sets'] = sets.values()
    mg.update_data()
# Build the author-links JSON consumed by the visualization: one object per
# author with their contributor status and co-author dependencies.
data = mg.get_data()
data = data[data['Data Use'] == 'Y'].dropna(subset=['Authors'])

from biblio_reader import scholar_reader

# Author name -> True when the 'Contributor' column flags them as one.
types = {}
for person, marks in scholar_reader.authors(data, 'Contributor').items():
    types[person] = 'Contributor' in marks

# Every author appearing on a paper listed in mg.CONTR_PAPERS.
contributors = set()
for paper_id, author_field in zip(data['i'], data['Authors']):
    if paper_id in mg.CONTR_PAPERS:
        contributors.update(author_field.split(' & '))

# Assemble one record per author (skipping the 'others' placeholder), each
# depending on their distinct co-authors.
records = []
for person, coauthors in scholar_reader.authors(data, 'Authors',
                                                split=' & ').items():
    if person == 'others':
        continue
    depends = {name for name in coauthors
               if name != person and name != 'others'}
    records.append({
        'name': person,
        'type': "Contributor" if types[person] is True else 'Not a Contributor',
        'depends': list(depends),
    })

out_path = os.path.join(mg.dir(os.path.join('data', 'author-links')),
                        'objects.json')
with open(out_path, 'w') as o:
    json.dump(records, o, sort_keys=True, indent=4)
affiliations = ';;'.join( set([aff.text for aff in root.findall('.//Affiliation')])) qualifiers = ';;'.join( set([ qual.text for qual in root.findall('.//MeshHeading/QualifierName') ]).union( set([ key.text for key in root.findall('.//KeywordList/Keyword') ]))) parsed.append( (int(bib.replace('.xml', '')), authors, affiliations, qualifiers)) parsed_data = pd.DataFrame( parsed, columns=['i', 'authors', 'affiliations', 'qualifiers']) parsed_data.sort_values('i', inplace=True) parsed_data.to_csv(path_or_buf=outfile, index=False) if __name__ == '__main__': BIB_DIR = mg.dir(os.path.join(mg.WORKING_PATH, 'bibs')) PARSED_BIBS = os.path.join(mg.WORKING_PATH, 'parsed_bibs.csv') if 'PMCID' not in data: get_ids(data) mg.update_data() if not os.path.exists(BIB_DIR): write_bib(data, mg.dir(BIB_DIR)) parse_bib(BIB_DIR, PARSED_BIBS)
import random, os, collections, math, manager as mg

# Directory holding one text file per reviewer with their assignments.
checker_dir = mg.dir(os.path.join(mg.WORKING_PATH, 'reviewer_assigns'))
data = mg.get_data()
paragraphs = mg.get_paragraphs()


class Member(object):

    def __init__(self, name, path):
        """
        The Member class represents a reviewer of the articles

        :param name: Reviewer ID
        :param path: Path of reviewer text file
        """
        self.name = name
        # The reviewer's state file lives at <path>/<name>.txt.
        self.path = '/'.join([path, name + '.txt'])
        if os.path.exists(self.path):
            # NOTE(review): file handle is never closed — consider `with`.
            file = open(self.path)
            if len(file.readlines()) < 4:
                # Fewer than four lines: no assignment line present yet.
                self.articles = []
            else:
                # NOTE(review): readlines() above already consumed the file,
                # so this loop iterates zero times and self.articles may stay
                # unset on this branch — looks like a latent bug; confirm
                # intended behavior before relying on it.
                for i, line in enumerate(file):
                    if i == 3:
                        # Line 4 is a comma-separated list of article ids.
                        self.articles = sorted(
                            [int(l) for l in str(line).strip().split(',')])
                        break
        else:
            # No reviewer file yet: start with an empty assignment list.
            self.articles = []
        # Snapshot of the articles already recorded in the reviewer file.
        self.written = list(self.articles)

    # NOTE(review): __str__ is truncated in this chunk; body continues
    # beyond the visible source.
    def __str__(self):
        """
import pandas as pd, matplotlib.pyplot as plt, manager as mg, os, datetime, collections, numpy as np
from titlecase import titlecase

# Output directory for generated statistics figures/files.
STAT_DIR = mg.dir(os.path.join(mg.OUTPUT_PATH, 'stats'))


def count_visualizer(value_count, stat_type, name, row_limit=None, color=None):
    """
    Counts values of specific columns in dataframe

    :param value_count: A value counts series, dict, or LOT (see pandas
        value_count function)
    :param stat_type: one of: bar, barh, pie, plot
    :param name: output file name
    :param row_limit: Sets a limit to how many highest values should be counted
    :param color: matplotlib color used for bar, barh, and plot charts
    :return: csv, bar, or pie file
    """
    # Keep only the first `row_limit` entries; a None limit ([:None]) keeps
    # everything.  Relies on the input already being sorted by count.
    value_count = {value: count
                   for value, count in list(dict(value_count).items())[:row_limit]}
    plt.figure()
    if stat_type == 'bar':
        plt.bar(range(len(value_count)), list(value_count.values()),
                align='center', color=color)
        plt.xticks(range(len(value_count)), value_count.keys())
    elif stat_type == 'barh':
        plt.barh(range(len(value_count)), list(value_count.values()),
                 align='center', tick_label=value_count.keys(), color=color)
    elif stat_type == 'pie':
        plt.pie(list(value_count.values()), labels=value_count.keys(),
                autopct='%1.1f%%', shadow=True)
        # Equal aspect ratio so the pie is drawn as a circle.
        plt.axis('equal')
    elif stat_type == 'plot':
        plt.plot(list(value_count.values()), color=color)
        # Label only the first / middle / last x positions.  Each key is
        # indexed with [1], i.e. assumed to be a pair whose second element is
        # the display label — TODO confirm against callers.
        plt.xticks([0, int(len(value_count) / 2), len(value_count)],
                   [list(value_count.keys())[0][1],
                    list(value_count.keys())[int(len(value_count) / 2)][1],
                    list(value_count.keys())[len(value_count) - 1][1]])
        plt.fill_between(range(len(value_count)), list(value_count.values()))
        # NOTE(review): function continues beyond this chunk (no save/show
        # visible here).
import manager as mg
import json, re, os
from urllib import request as req
from unidecode import unidecode

# Bibliography rows that actually carry affiliation strings.
bibs = mg.get_bibs().dropna(subset=['affiliations'])
map_dir = mg.dir(os.path.join(mg.INPUT_PATH, 'map_tools'))
# NOTE(review): hard-coded Google API key committed to source — move it to
# configuration / an environment variable and rotate the key.
API = 'AIzaSyCc3U_YDbluAh_Eja8Zc4e4PX04ndyDXgE'
# Bracketed footnote markers such as "[1]" act as affiliation separators;
# normalize them to ';' before splitting.
bibs['affiliations'] = bibs['affiliations'].apply(
    lambda aff: re.sub('\[?\d\]', ';', aff))
# Paper id -> set of individual affiliation strings (records are separated by
# ';;', individual affiliations within a record by ';').
affiliations = {
    i: {aff.strip()
        for sublist in [affil.split(';') for affil in affiliation.split(';;')]
        for aff in sublist}
    for i, affiliation in zip(bibs['i'], bibs['affiliations'])
}


def repair_affils(affiliations):
    # Cleaned-affiliation accumulator (populated beyond this chunk).
    aff_dict = dict()
    # Noise patterns stripped from raw affiliation strings: parentheticals,
    # e-mail addresses, leading non-letters, trailing punctuation, dangling
    # "and" connectors, and contact/footnote boilerplate.
    substitutions = [
        re.compile('\s\([^(]*\)'),
        re.compile('\s*(Electronic address:\s*)*\S+@\S+'),
        re.compile('\A[^a-zA-Z]+'),
        re.compile('\s*[\.,]\Z'),
        re.compile('Email:.*'),
        re.compile('\Aand\s*'),
        re.compile(',?\s+and\Z'),
        re.compile('tel:.*|.*affiliated.*|To whom.*')
        # NOTE(review): list and function continue beyond this chunk.
import manager as mg, os, sys, csv, collections
from biblio_reader import scholar_reader

data = mg.get_data()
# Directories of manual validity-check CSVs and journal category files.
checks = mg.dir(os.path.join(mg.INPUT_PATH, 'article_review'))
categories = mg.dir(os.path.join(mg.INPUT_PATH, 'journal_categories'))
if len(os.listdir(checks)) == 0:
    # Nothing to analyze without at least one manual check file.
    print('No validity checks to analyze')
    sys.exit(1)


def usage_directory(directory):
    """
    After manually looking at each publication and marking correctly, takes
    the marks from the csv directory and makes sure each are double checked
    for accuracy.

    :param directory: The directory of csv files of manual checks
    :return: A dictionary of the publication numbers and their validity
    """
    checks = {}
    for check in os.listdir(directory):
        # Only CSV files are manual-check sheets; skip everything else.
        if '.csv' not in check:
            continue
        full_path = '/'.join([directory, check])
        with open(full_path, 'r') as f:
            reader = list(csv.reader(f))
            # Skip the header row; column 0 is the publication number,
            # column 1 the reviewer's validity mark.
            for rows in reader[1:]:
                if rows[1] != '':
                    k = int(rows[0])
                    # Normalize the mark (drop ' and ' connectors, uppercase).
                    v = rows[1].replace(' and ', '').upper()
                    if k not in checks:
                        checks[k] = [v]
                    else:
                        # NOTE(review): function continues beyond this chunk.