Example 1
import operator
from itertools import groupby

# config and the get_citations / get_references / get_meta_data helpers
# are assumed to be provided by the surrounding module.
def get_suggestions(**args):
    # initializations
    bibcodes = args.get("bibcodes", [])
    if not bibcodes:
        return []
    # Any overrides for default values?
    Nsuggestions = args.get("Nsuggest", config.BIBUTILS_DEFAULT_SUGGESTIONS)
    output_format = args.get("fmt", config.BIBUTILS_DEFAULT_FORMAT)
    # strip stray whitespace and cap the number of input bibcodes
    bibcodes = [b.strip() for b in bibcodes][:config.BIBUTILS_MAX_INPUT]
    # start processing
    # get the citations for all publications (keeping multiplicity is essential)
    cit_dict = get_citations(bibcodes=bibcodes, threads=config.BIBUTILS_THREADS)
    cits = [item for sublist in cit_dict.values() for item in sublist]
    # drop empty entries; build a list so it can be concatenated with refs below
    cits = [c for c in cits if c]
    # get references
    refs = get_references(bibcodes=bibcodes)
    # drop empty entries
    refs = [r for r in refs if r]
    # remove the input papers themselves, leaving the candidate suggestions
    papers = [p for p in cits + refs if p not in bibcodes]
    # establish frequencies of papers in results
    paperFreq = [(k, len(list(g))) for k, g in groupby(sorted(papers))]
    # and sort them, most frequent first
    paperFreq = sorted(paperFreq, key=operator.itemgetter(1), reverse=True)
    # remove all papers with frequencies at or below the threshold
    # (build a list so it can be sliced below)
    paperFreq = [p for p in paperFreq if p[1] > config.BIBUTILS_THRESHOLD_FREQUENCY]
    # get metadata for suggestions
    meta_dict = get_meta_data(results=paperFreq[:Nsuggestions])
    # return results in the requested format; only the "score" format
    # reports the actual frequency score
    return [
        {
            "bibcode": x,
            "score": y if output_format == "score" else "NA",
            "title": meta_dict[x]["title"],
            "author": meta_dict[x]["author"],
        }
        for (x, y) in paperFreq[:Nsuggestions]
        if x in meta_dict
    ]
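A minimal usage sketch for the function above, assuming the config module defines the BIBUTILS_* settings and that the get_* helpers are importable; the bibcode strings are made-up placeholders, not real identifiers:

# Hypothetical call; the bibcodes below are placeholders.
suggestions = get_suggestions(
    bibcodes=["2020Jour...1....1A", "2021Jour...2....2B"],
    Nsuggest=5,
    fmt="score",
)
for entry in suggestions:
    print(entry["bibcode"], entry["score"], entry["title"])

As a design note, the sorted/groupby frequency count in the function is essentially equivalent to collections.Counter(papers).most_common(), which avoids the explicit sort.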
Example 2
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations
    """
    solr_url = config.SOLR_URL
    max_hits = config.METRICS_MAX_HITS
    threads = config.METRICS_THREADS
    chunk_size = config.METRICS_CHUNK_SIZE
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        bibcodes = [b.strip() for b in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow retrieving bibcodes from private libraries;
        # this is currently not used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        raise ValueError("no 'query', 'bibcodes' or 'libid' argument supplied")
    # Split the list of bibcodes into chunks for parallel processing
    biblists = list(chunks(bibcodes, chunk_size))
    # Gather all publication information into one publication dictionary,
    # keyed on bibcode
    publication_data = get_publication_data(biblists=biblists)
    missing_bibcodes = [b for b in bibcodes if b not in publication_data]
    if missing_bibcodes:
        app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
    bibcodes = [b for b in bibcodes if b not in missing_bibcodes]
    # Get citation dictionaries (all, refereed and non-refereed citations in
    # separate dictionaries, so that we don't have to figure this out later)
    (cit_dict, ref_cit_dict, non_ref_cit_dict) = get_citations(bibcodes=bibcodes, pubdata=publication_data, type='metrics')
    # Count the number of unique citing papers (all and refereed-only);
    # the first element of each citation tuple identifies the citing paper
    Nciting = len({x[0] for v in cit_dict.values() for x in v})
    Nciting_ref = len({x[0] for v in ref_cit_dict.values() for x in v})
    # Gather all usage data from the MongoDB 'adsdata' collection;
    # the result is a dictionary, also keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    # Generate the list of per-document attribute vectors that will be
    # used to calculate the metrics
    attr_list = make_vectors(bibcodes, publication_data, ads_data, cit_dict, ref_cit_dict, non_ref_cit_dict)
    # Sort the attribute vectors on citation count (descending), which
    # makes e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)

    return attr_list, Nciting, Nciting_ref
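The chunks helper used above is not shown in this example. A minimal sketch of what it presumably does, namely splitting a list into chunk_size-sized slices (an assumption based on how biblists is built):

def chunks(lst, chunk_size):
    # Yield successive chunk_size-sized slices of lst; the final slice
    # may be shorter. Sketch only; the real helper may differ.
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]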
Example 3
import os
import time

from tqdm import tqdm

import utils
from config import *

if __name__ == "__main__":
    vocab = utils.get_vocabulary()  # loads vocabulary present in system
    citations = utils.get_citations()  # loads citation counts for doc_ids
    while True:
        print("Enter a word to search the index for:")
        x = input()

        if x in vocab:
            start_time = time.time()
            if os.path.exists("indexes/inverted_index_" + x + ".pbz2"):
                # a dedicated index exists for this exact word
                index = utils.load_index("indexes/inverted_index_" + x)
                loaded = x
            else:
                # fall back to the index for the word's first letter
                index = utils.load_index(filename="indexes/inverted_index_" + x[0])
                loaded = x[0]
            end_time = time.time()
            print(("Took {} seconds to load index " +
                   loaded).format(end_time - start_time))
            # number of documents the term appears in
            print(index[x]["doc_frequency"])
            # top 10 documents for the term: document id, number of times
            # the term appears in the document, and the document's citation count
            for k in list(index[x]["doc_ids"].keys())[:10]:
                print(k, index[x]["doc_ids"][k], citations[k])
        else:
            print("'{}' not found in the vocabulary.".format(x))
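utils.load_index is not shown either; given the .pbz2 extension checked above, it plausibly reads a bz2-compressed pickle. A minimal sketch under that assumption (the real helper may behave differently):

import bz2
import pickle

def load_index(filename):
    # Assumed format: a pickled index compressed with bz2 and stored
    # as <filename>.pbz2, matching the os.path.exists check above.
    with bz2.BZ2File(filename + ".pbz2", "rb") as infile:
        return pickle.load(infile)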