Beispiel #1
0
from mongo.mongo_provider import MongoProvider
import re
from itertools import zip_longest
import uuid
import csv


# Module-level collection handles, created once when this module is imported.
# Each line builds its own MongoProvider instance.
# NOTE(review): presumably "authors" holds the raw author records and
# "clean_authors" the de-duplicated ones - confirm against MongoProvider.
raw_authors_collection = MongoProvider().get_authors_collection()
clean_authors_collection = MongoProvider().get_clean_authors_collection()


def get_last_name_to_docs(entries):
    """Group author entries by last name.

    The last name is the part of ``entry["name"]`` that precedes the first
    comma; a name without a comma is used as the key in full.

    Args:
        entries: iterable of mappings, each with a ``"name"`` key.

    Returns:
        A dict mapping each last name to the list of entries carrying it,
        with entries kept in the order they were encountered.
    """
    grouped = {}
    for doc in entries:
        surname, _, _ = doc["name"].partition(",")
        # setdefault creates the bucket on first sight of a surname.
        grouped.setdefault(surname, []).append(doc)

    return grouped


def determine_display_name(raw_names):
    """Pick the longest name variant as the display name.

    Args:
        raw_names: iterable of name strings.

    Returns:
        The longest string in ``raw_names``; when several share the maximum
        length the first one wins. An empty input yields ``""``.
    """
    # max() returns the first maximal element, matching a strict ">" scan.
    return max(raw_names, key=len, default="")
Beispiel #2
0
from mongo.mongo_provider import MongoProvider
import csv
import vector_utils

# Module-level collection handles, created once when this module is imported.
# Each line builds its own MongoProvider instance.
division_collection = MongoProvider().get_divisions_collection()
clean_authors_collection = MongoProvider().get_clean_authors_collection()
publications_collection = MongoProvider().get_publications_collection()


def write_data(data, path):
    """Write a sequence of dict records to ``path`` as a comma-separated file.

    The keys of the first record become the header row; every record is then
    written with its values in that same column order.

    Args:
        data: sequence of mappings that all contain the keys of ``data[0]``.
            An empty sequence produces an empty file.
        path: destination file path; an existing file is overwritten.

    Raises:
        KeyError: if a record lacks one of the header keys.
    """
    # newline="" hands line-ending control to the csv module; without it
    # every row is followed by a blank line on platforms that translate "\n".
    with open(path, "w", newline="") as csvfile:
        if not data:
            # Nothing to derive headers from; leave the file empty.
            return

        writer = csv.writer(csvfile, delimiter=",")
        headers = list(data[0].keys())
        writer.writerow(headers)
        writer.writerows([entry[header] for header in headers] for entry in data)


def division_top_terms(path, n):
    data = []

    for author_doc in division_collection.find():
        division = author_doc["_id"]
        vector = author_doc["tfidf_vector"]

        for idx, token in enumerate(vector.keys()):
            if idx == n:
                break

            weight = vector.get(token)
Beispiel #3
0
    with open(path, "r", encoding="utf-8-sig") as csvfile:
        reader = csv.reader(csvfile, delimiter="\t")
        headers = []
        for idx, row in enumerate(reader):
            if idx == 0:
                headers = row
            else:
                entry = {
                    get_wos_header_name(header): value
                    for header, value in zip(headers, row)
                }

                entry["_id"] = entry["Accession Number"]

                collection.insert_one(entry)


if __name__ == "__main__":
    # Setup collections to insert raw data
    mongo_provider = MongoProvider()
    collection = mongo_provider.get_wos_collection()

    # Start from an empty collection so a re-run does not collide with
    # the "_id" values inserted by a previous run.
    collection.drop()

    data_dir = Path("data/wos")

    print("Inserting entries")
    for data_file in data_dir.glob("**/*"):
        # The recursive "**/*" pattern also yields sub-directories, which
        # cannot be opened as export files; only regular files are loaded.
        if data_file.is_file():
            insert_entries(data_file, collection)
    print("Done.")
Beispiel #4
0
from collections import Counter, OrderedDict

import math


def normalize_vector(vector):
    """Scale a sparse vector to unit Euclidean (L2) length.

    Args:
        vector: mapping of token -> numeric weight.

    Returns:
        A new dict with the same keys whose weights are divided by the
        vector's L2 norm. A vector whose norm is zero (empty, or all weights
        zero) has no direction, so it is returned with every weight ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    norm_length = math.sqrt(sum(weight * weight for weight in vector.values()))
    if norm_length == 0:
        # Dividing by a zero norm would raise; keep the keys, zero the weights.
        return dict.fromkeys(vector, 0.0)

    return {token: weight / norm_length for token, weight in vector.items()}


if __name__ == "__main__":
    mongo_provider = MongoProvider()
    collection = mongo_provider.get_publications_collection()
    docs = collection.find({}, {"tokens": 1})

    print("Determining document and term frequencies")
    doc_term_freq = {}
    doc_freq = {}
    doc_size = 0
    for doc in docs:
        _id = doc["_id"]
        tokens = doc["tokens"]

        if tokens:
            doc_size += 1
            counter = Counter(tokens)
            total = sum(count for count in counter.values())
Beispiel #5
0
def is_jpl_address(address):
    """Report whether any comma-separated part of ``address`` is a JPL marker.

    Each part is stripped of surrounding whitespace and lower-cased, then
    compared for exact membership in ``jpl_address_check``.

    Args:
        address: address string with comma-separated components.

    Returns:
        True if at least one normalized part is in ``jpl_address_check``,
        otherwise False.
    """
    normalized_parts = (piece.strip().lower() for piece in address.split(","))
    return any(piece in jpl_address_check for piece in normalized_parts)


def is_caltech_address(address):
    """Report whether ``address`` is a Caltech address that is not a JPL one.

    Args:
        address: address string; the check for ``"CALTECH"`` is a
            case-sensitive substring test.

    Returns:
        True when ``"CALTECH"`` occurs in ``address`` and
        ``is_jpl_address(address)`` is false; otherwise False.
    """
    if "CALTECH" not in address:
        return False

    return not is_jpl_address(address)


if __name__ == "__main__":
    mongo_provider = MongoProvider()
    publication_collection = mongo_provider.get_publications_collection()

    print("Determining author entries")

    author_data_dict = {}
    for idx, doc in enumerate(publication_collection.find()):
        if idx % 1000 == 0:
            print(f"STATUS: {idx}")

        document_id = doc["_id"]
        author_entries = doc["authors"]
        for author_entry in author_entries:
            # Set up data entry for author name
            name = author_entry["name"]
            key = None
Beispiel #6
0
from mongo.mongo_provider import MongoProvider
import csv
import vector_utils

# Module-level collection handles, created once when this module is imported.
# Each line builds its own MongoProvider instance.
raw_author_collection = MongoProvider().get_authors_collection()
division_collection = MongoProvider().get_divisions_collection()
publications_collection = MongoProvider().get_publications_collection()

# Lookup table: address string -> comma-separated division names.
# Starts empty; filled by populate_address_to_division() and read by
# get_divisions().
address_to_division = {}


def get_divisions(addresses):
    """Collect the distinct divisions associated with a set of addresses.

    Each address is looked up in ``address_to_division``; the stored value is
    a comma-separated string of division names. Addresses that are unknown or
    map to an empty value contribute nothing.

    Args:
        addresses: iterable of address strings.

    Returns:
        A set of whitespace-stripped division names.
    """
    found = set()
    for address in addresses:
        raw_divisions = address_to_division.get(address, "")
        if not raw_divisions:
            continue
        found.update(piece.strip() for piece in raw_divisions.split(","))

    return found


def populate_address_to_division(path):
    """Load the address -> divisions table from a CSV file.

    For every row the first column is used as the key and the second column
    as the value stored in the module-level ``address_to_division`` dict
    (``get_divisions`` later splits that value on commas). A later row with
    the same first column overwrites an earlier one.

    Rows with fewer than two columns, such as blank lines, are skipped
    instead of raising ``IndexError``.

    Args:
        path: path of the CSV file to read.
    """
    # newline="" is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, "r", newline="") as csvfile:
        for row in csv.reader(csvfile):
            if len(row) < 2:
                continue
            address_to_division[row[0]] = row[1]

Beispiel #7
0
from mongo.mongo_provider import MongoProvider
import csv


# Module-level handle to the authors collection, created at import time.
author_collection = MongoProvider().get_authors_collection()
# Lower-cased address components that mark an address as JPL.
# Entries are matched verbatim against normalized address parts, so the
# spellings below (including the irregular ones) must stay exactly as they are.
jpl_address_check = [
    "jet prop lab",
    "4800 oak grove dr",
    "jpl",
    "nasa jet prop lab",
    "jet prop laboratory",
    "jet propulsion lab",
    "jet prop labs",
    "nasa jpl",
    "jet propusl laborotory",
]


def is_jpl_address(address):
    """Report whether any comma-separated part of ``address`` is a JPL marker.

    Each part is stripped of surrounding whitespace and lower-cased, then
    compared for exact membership in ``jpl_address_check``.

    Args:
        address: address string with comma-separated components.

    Returns:
        True if at least one normalized part is in ``jpl_address_check``,
        otherwise False.
    """
    normalized_parts = (piece.strip().lower() for piece in address.split(","))
    return any(piece in jpl_address_check for piece in normalized_parts)

if __name__ == "__main__":
    output = "data/authors/addresses.csv"

    caltech_addresses = set()
    for doc in author_collection.find():