Esempio n. 1
0
def main():
    """Convert PDFs to text, extract paragraphs, and associate keyword sets.

    Requires both ``WEIGHTED_SETS`` and ``UNWEIGHTED_SETS`` to be defined in
    manager.py; prints a message and aborts when either one is missing.
    """
    # BUG FIX: the original guard used `and`, so it only fired when BOTH names
    # were missing; with just one missing the code fell through and crashed on
    # the mg.WEIGHTED_SETS + mg.UNWEIGHTED_SETS access below. `or` + hasattr
    # rejects either omission.
    if not hasattr(mg, 'WEIGHTED_SETS') or not hasattr(mg, 'UNWEIGHTED_SETS'):
        print(
            'Define WEIGHTED_SETS and UNWEIGHTED_SETS in manager.py before using this module'
        )
        return
    # mg.dir(...) presumably ensures the directory exists -- TODO confirm.
    TXT_DIR = mg.dir(os.path.join(mg.WORKING_PATH, 'txts'))
    PDF_DIR = mg.dir(os.path.join(mg.INPUT_PATH, 'pdfs'))
    data = mg.get_data()
    # Convert every PDF under PDF_DIR into a text file under TXT_DIR.
    convertToText.walkAndText(PDF_DIR, TXT_DIR)
    # Extract paragraphs matching any of the keyword sets into one output file.
    find_paragraphs(TXT_DIR,
                    mg.WEIGHTED_SETS + mg.UNWEIGHTED_SETS,
                    outfile=os.path.join(mg.WORKING_PATH, 'paragraphs'))
    # Associate each record with its keyword sets and persist the result.
    sets = assoc_sets(data,
                      TXT_DIR,
                      mg.WEIGHTED_SETS,
                      less_weighted_sets=mg.UNWEIGHTED_SETS)
    data['Sets'] = sets.values()
    mg.update_data()
Esempio n. 2
0
# Keep only publications approved for data use that have an author list.
data = mg.get_data()
data = data[data['Data Use'] == 'Y'].dropna(subset=['Authors'])
from biblio_reader import scholar_reader

# auth -> True when any of that author's entries contains 'Contributor'.
types = {}
for auth, val in scholar_reader.authors(data, 'Contributor').items():
    types[auth] = 'Contributor' in val

# Every individual author named on a paper listed in mg.CONTR_PAPERS.
contributors = set()
for i, authors in zip(data['i'], data['Authors']):
    if i in mg.CONTR_PAPERS:
        contributors.update(authors.split(' & '))

# Build one JSON node per author (skipping the catch-all 'others' entry),
# each carrying the distinct co-author names it depends on.
nodes = []
for auth, affils in scholar_reader.authors(data, 'Authors',
                                           split=' & ').items():
    if auth == 'others':
        continue
    nodes.append({
        'name': auth,
        'type': "Contributor" if types[auth] is True else 'Not a Contributor',
        'depends': list({aff
                         for aff in affils
                         if aff != auth and aff != 'others'}),
    })

out_path = os.path.join(mg.dir(os.path.join('data', 'author-links')),
                        'objects.json')
with open(out_path, 'w') as o:
    json.dump(nodes, o, sort_keys=True, indent=4)
Esempio n. 3
0
        affiliations = ';;'.join(
            set([aff.text for aff in root.findall('.//Affiliation')]))
        qualifiers = ';;'.join(
            set([
                qual.text
                for qual in root.findall('.//MeshHeading/QualifierName')
            ]).union(
                set([
                    key.text for key in root.findall('.//KeywordList/Keyword')
                ])))
        parsed.append(
            (int(bib.replace('.xml', '')), authors, affiliations, qualifiers))
    parsed_data = pd.DataFrame(
        parsed, columns=['i', 'authors', 'affiliations', 'qualifiers'])
    parsed_data.sort_values('i', inplace=True)
    parsed_data.to_csv(path_or_buf=outfile, index=False)


if __name__ == '__main__':
    # Bibliography XML directory and the CSV the parsed records end up in.
    PARSED_BIBS = os.path.join(mg.WORKING_PATH, 'parsed_bibs.csv')
    BIB_DIR = mg.dir(os.path.join(mg.WORKING_PATH, 'bibs'))

    # Populate the 'PMCID' column once (presumably PubMed Central IDs --
    # TODO confirm) and persist it through the manager module.
    if 'PMCID' not in data:
        get_ids(data)
        mg.update_data()

    # Only (re)write the bibliography files when the directory is absent.
    if not os.path.exists(BIB_DIR):
        write_bib(data, mg.dir(BIB_DIR))

    parse_bib(BIB_DIR, PARSED_BIBS)
Esempio n. 4
0
import random, os, collections, math, manager as mg
# Directory holding per-reviewer assignment files (presumably one text file
# per reviewer -- see Member below; TODO confirm mg.dir creates it).
checker_dir = mg.dir(os.path.join(mg.WORKING_PATH, 'reviewer_assigns'))
# Shared article data and extracted paragraphs loaded via the manager module.
data = mg.get_data()
paragraphs = mg.get_paragraphs()


class Member(object):
    def __init__(self, name, path):
        """
        The Member class represents a reviewer of the articles
        :param name: Reviewer ID
        :param path: Path of reviewer text file
        """
        self.name = name
        self.path = '/'.join([path, name + '.txt'])
        if os.path.exists(self.path):
            file = open(self.path)
            if len(file.readlines()) < 4:
                self.articles = []
            else:
                for i, line in enumerate(file):
                    if i == 3:
                        self.articles = sorted(
                            [int(l) for l in str(line).strip().split(',')])
                        break
        else:
            self.articles = []
        self.written = list(self.articles)

    def __str__(self):
        """
Esempio n. 5
0
import pandas as pd, matplotlib.pyplot as plt, manager as mg, os, datetime, collections, numpy as np
from titlecase import titlecase
STAT_DIR = mg.dir(os.path.join(mg.OUTPUT_PATH, 'stats'))


def count_visualizer(value_count, stat_type, name, row_limit=None, color=None):
    """
    Counts values of specific columns in dataframe
    :param value_count: A value counts series, dict, or LOT (see pandas value_count function)
    :param stat_type: one of: bar, barh, pie, plot
    :param name: output file name
    :param row_limit: Sets a limit to how many highest values should be counted
    :param color: matplotlib color forwarded to the bar/barh/plot call
    :return: csv, bar, or pie file
    """
    # Keep only the first row_limit entries (row_limit=None keeps everything).
    value_count = {value: count for value, count in list(dict(value_count).items())[:row_limit]}
    plt.figure()
    if stat_type == 'bar':
        plt.bar(range(len(value_count)), list(value_count.values()), align='center', color=color)
        plt.xticks(range(len(value_count)), value_count.keys())
    elif stat_type == 'barh':
        plt.barh(range(len(value_count)), list(value_count.values()), align='center', tick_label=value_count.keys(),
                 color=color)
    elif stat_type == 'pie':
        plt.pie(list(value_count.values()), labels=value_count.keys(), autopct='%1.1f%%', shadow=True)
        # Equal aspect ratio so the pie renders as a circle.
        plt.axis('equal')
    elif stat_type == 'plot':
        plt.plot(list(value_count.values()), color=color)
        # Label only the first, middle and last x positions; each key is
        # indexed with [1], so keys are presumably 2-tuples -- TODO confirm.
        plt.xticks([0, int(len(value_count) / 2), len(value_count)],
                   [list(value_count.keys())[0][1], list(value_count.keys())[int(len(value_count) / 2)][1],
                    list(value_count.keys())[len(value_count) - 1][1]])
        plt.fill_between(range(len(value_count)), list(value_count.values()))
Esempio n. 6
0
import manager as mg
import json, re, os
from urllib import request as req
from unidecode import unidecode

# Bibliography records that actually carry affiliation text.
bibs = mg.get_bibs().dropna(subset=['affiliations'])
map_dir = mg.dir(os.path.join(mg.INPUT_PATH, 'map_tools'))
# SECURITY: hard-coded Google API key committed to source -- it should be
# revoked and loaded from an environment variable or config file instead.
API = 'AIzaSyCc3U_YDbluAh_Eja8Zc4e4PX04ndyDXgE'
# Turn bracketed reference markers such as "[1]" into ';' separators.
# FIX: raw string for the regex -- '\[' and '\d' in a plain string emit
# invalid-escape DeprecationWarnings; the pattern itself is unchanged.
bibs['affiliations'] = bibs['affiliations'].apply(
    lambda aff: re.sub(r'\[?\d\]', ';', aff))
# i -> set of individual affiliation strings; records are separated by ';;'
# and individual affiliations within a record by ';'.
affiliations = {
    i: {
        aff.strip()
        for sublist in [affil.split(';') for affil in affiliation.split(';;')]
        for aff in sublist
    }
    for i, affiliation in zip(bibs['i'], bibs['affiliations'])
}


def repair_affils(affiliations):
    aff_dict = dict()
    substitutions = [
        re.compile('\s\([^(]*\)'),
        re.compile('\s*(Electronic address:\s*)*\S+@\S+'),
        re.compile('\A[^a-zA-Z]+'),
        re.compile('\s*[\.,]\Z'),
        re.compile('Email:.*'),
        re.compile('\Aand\s*'),
        re.compile(',?\s+and\Z'),
        re.compile('tel:.*|.*affiliated.*|To whom.*')
Esempio n. 7
0
import manager as mg, os, sys, csv, collections
from biblio_reader import scholar_reader
data = mg.get_data()
# Input directories: manual article validity checks and journal categories.
checks = mg.dir(os.path.join(mg.INPUT_PATH, 'article_review'))
categories = mg.dir(os.path.join(mg.INPUT_PATH, 'journal_categories'))
# Bail out early when there are no review files to analyze.
if not os.listdir(checks):
    print('No validity checks to analyze')
    sys.exit(1)


def usage_directory(directory):
    """
    After manually looking at each publication and marking correctly, takes the marks from the csv directory
     and makes sure each are double checked for accuracy.
    :param directory: The directory of csv files of manual checks
    :return: A dictionary of the publication numbers and their validity
    """
    checks = {}
    for check in os.listdir(directory):
        if '.csv' not in check:
            continue
        full_path = '/'.join([directory, check])
        with open(full_path, 'r') as f:
            reader = list(csv.reader(f))
            for rows in reader[1:]:
                if rows[1] != '':
                    k = int(rows[0])
                    v = rows[1].replace(' and ', '').upper()
                    if k not in checks:
                        checks[k] = [v]
                    else: