Example #1
0
def get_self_citations_count(recids, algorithm='simple',
                                  precompute=CFG_BIBRANK_SELFCITES_PRECOMPUTE):
    """Return the number of citations of RECIDS left after removing self-cites.

    For every record the number of citing records (``get_cited_by``) is
    reduced by its number of self-citations, and the differences are summed.

    The self-citation figures come from one of two places:
    * ``precompute`` false: they are computed on the fly by the function
      registered in ``ALL_ALGORITHMS`` under the name ``algorithm``
    * ``precompute`` true: they are read through
      ``get_precomputed_self_cites_list``; records missing from that result
      count as having no self-citations
    """
    if precompute:
        # Rows are indexed as (recid, number of self-citations).
        precomputed = dict(
            (row[0], row[1])
            for row in get_precomputed_self_cites_list(recids))
        return sum(len(get_cited_by(recid)) - precomputed.get(recid, 0)
                   for recid in recids)

    author_tags = get_authors_tags()
    compute_selfcites = ALL_ALGORITHMS[algorithm]
    return sum(
        len(get_cited_by(recid)) - len(compute_selfcites(recid, author_tags))
        for recid in recids)
Example #2
0
def get_self_citations_count(recids,
                             algorithm='simple',
                             precompute=CFG_BIBRANK_SELFCITES_PRECOMPUTE):
    """Sum, over RECIDS, the citations that are not self-citations.

    Depending on ``precompute`` the per-record self-citation numbers are
    either:
    * computed on the spot with ``ALL_ALGORITHMS[algorithm]``
    * or looked up in the result of ``get_precomputed_self_cites_list``
      (a record absent from that result contributes zero self-citations)
    """
    if not precompute:
        author_tags = get_authors_tags()
        compute_selfcites = ALL_ALGORITHMS[algorithm]

        count = 0
        for recid in recids:
            nb_citers = len(get_cited_by(recid))
            nb_selfcites = len(compute_selfcites(recid, author_tags))
            count += nb_citers - nb_selfcites
        return count

    # Index the pre-computed rows by their first column (the record id).
    selfcites_by_recid = dict(
        (row[0], row[1]) for row in get_precomputed_self_cites_list(recids))

    count = 0
    for recid in recids:
        nb_citers = len(get_cited_by(recid))
        count += nb_citers - selfcites_by_recid.get(recid, 0)
    return count
Example #3
0
def find_citations(rank_method_code, recID, hitset, verbose):
    """Rank by the amount of citations.

    If RECID can be converted to an integer, the records citing it are
    ranked; otherwise the members of HITSET are ranked.

    @param rank_method_code: not used by this function
    @param recID: record id, or any value that is not convertible to int
    @param hitset: records to rank when recID is not numeric
    @param verbose: when greater than 0, debugging details are stored in
        the module-level ``voutput``
    @return: tuple (ranked, prefix, postfix, message) where ranked is the
        result of ``get_cited_by_weight`` sorted ascending by its second
        member; ((), "", "", "") when there is nothing to return
    """
    global voutput
    voutput = ""

    # Only conversion failures mean "recID is not numeric"; anything else
    # (e.g. KeyboardInterrupt) must not be swallowed here.
    try:
        recidint = int(recID)
        recisint = True
    except (TypeError, ValueError, OverflowError):
        recisint = False

    if recisint:
        myrecords = get_cited_by(recidint)  # this is a simple list
        ret = get_cited_by_weight(myrecords)
    else:
        ret = get_cited_by_weight(hitset)

    # Ascending by the second member of the tuples. key= gives the same
    # stable order as the former cmp-based sort and also works on Python 3.
    ret.sort(key=lambda item: item[1])

    if verbose > 0:
        voutput = voutput+"\nrecID "+str(recID)+" is int: "+str(recisint)+" hitset "+str(hitset)+"\n"+"find_citations retlist "+str(ret)

    if ret:
        return (ret, "(", ")", "")
    return ((), "", "", "")
Example #4
0
    def related_records(recids, recids_processed):
        """Extend RECIDS with related records and drop the ones already done.

        Both arguments are updated in place: RECIDS loses everything found in
        RECIDS_PROCESSED, and RECIDS_PROCESSED gains what remains in RECIDS.
        ``fmt`` and ``latest_bibrank_run`` are taken from the enclosing scope.

        Returns RECIDS.
        """
        if fmt == "HDREF" and recids:
            # HDREF is the references tab: it has to be regenerated when the
            # record itself changes and also when one of its citations does,
            # so the citers of the selected records are pulled in as well.
            id_list = ','.join(str(r) for r in recids)
            sql = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % id_list

            # Keep only records whose modification date, formatted as a
            # string, sorts before the latest bibrank run.
            older_recids = intbitset([
                rec_id for rec_id, modified in run_sql(sql)
                if modified.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run
            ])
            for older_recid in older_recids:
                recids |= intbitset(get_cited_by(older_recid))

        # Never handle the same record twice ...
        recids -= recids_processed
        # ... and remember what is about to be handled.
        recids_processed += recids

        return recids
Example #5
0
def calculate_citation_history_coordinates(recid):
    """Return a list of citation graph coordinates for RECID, sorted by year.

    The x coordinates come from ``calculate_citation_graphe_x_coordinates``;
    each one starts at zero and is incremented once for every citing record
    whose year matches it.

    @param recid: id of the record whose citation history is wanted
    @return: list of (year, number of citations) tuples sorted by year, or
        an empty list when there are fewer than
        CFG_BIBRANK_CITATION_HISTORY_MIN_X_POINTS x coordinates
    """
    result = {}
    for year in calculate_citation_graphe_x_coordinates(recid):
        result[year] = 0

    if len(result) < CFG_BIBRANK_CITATION_HISTORY_MIN_X_POINTS:
        # do not generate graphs that have less than X points
        return []

    # A distinct loop variable keeps the ``recid`` parameter intact.
    for citer_recid in get_cited_by(recid):
        rec_date = get_record_year(citer_recid)
        # Some records simply do not have these fields
        if rec_date:
            # Maybe rec_date[0][0:4] has a typo and cannot
            # be converted to an int
            try:
                d = strptime(rec_date[0][:4], '%Y')
            except ValueError:
                pass
            else:
                # Years outside the graph's x range are ignored.
                if d.year in result:
                    result[d.year] += 1

    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    return sorted(result.items())
Example #6
0
def compute_self_citations(recid, tags, authors_fun):
    """Find which records citing RECID are self-citations

    Args:
     - recid: record id
     - tags: the tag numbers for author, coauthors, collaborations,
             passed through to the record-reading helpers
     - authors_fun: callable taking (recid, tags) and returning the names
                    to compare against the authors of RECID

    Returns the set of citing record ids considered self-citations
    (an empty set when nothing cites RECID).

    When RECID has no authors or more than 20 of them and has collaborations,
    a citer is a self-citation if it shares a collaboration with RECID.
    Otherwise a citer is a self-citation if authors_fun() of the citer
    overlaps the authors of RECID, except that citers which have
    collaborations are skipped when RECID has no authors or the citer has
    more than 20 authors.
    """
    citers = get_cited_by(recid)
    if not citers:
        return set()

    authors = frozenset(get_authors_from_record(recid, tags))

    collaborations = None
    if not authors or len(authors) > 20:
        collaborations = frozenset(
            get_collaborations_from_record(recid, tags))

    if collaborations:
        # Compare collaboration names
        return set(
            citer for citer in citers
            if collaborations & frozenset(
                get_collaborations_from_record(citer, tags)))

    # Compare author names
    self_citations = set()
    for citer in citers:
        citer_authors = get_authors_from_record(citer, tags)
        maybe_collaboration = not authors or len(citer_authors) > 20
        if maybe_collaboration and \
                get_collaborations_from_record(citer, tags):
            # A collaboration record citing an author's record
            # does not count as a self-citation
            continue
        if authors & frozenset(authors_fun(citer, tags)):
            self_citations.add(citer)

    return self_citations
Example #7
0
def compute_self_citations(recid, tags, authors_fun):
    """Return the set of records citing RECID that count as self-citations

    Args:
     - recid: record id
     - tags: the tag numbers for author, coauthors, collaborations,
             handed over to the helpers that read a record
     - authors_fun: callable (recid, tags) -> names; its result for a citing
                    record is matched against the authors of RECID

    Collaboration names are compared when RECID has no authors or more than
    20 authors, and has at least one collaboration. In every other case
    author names are compared, and a citing record that has collaborations
    is ignored if RECID has no authors or that citing record has more than
    20 authors.
    """
    citing_recids = get_cited_by(recid)
    if not citing_recids:
        return set()

    found = set()
    record_authors = frozenset(get_authors_from_record(recid, tags))

    if not record_authors or len(record_authors) > 20:
        record_collabs = frozenset(get_collaborations_from_record(recid, tags))
    else:
        record_collabs = None

    if record_collabs:
        # Match on collaboration names
        for citing in citing_recids:
            citing_collabs = frozenset(
                get_collaborations_from_record(citing, tags))
            if record_collabs & citing_collabs:
                found.add(citing)
        return found

    # Match on author names
    for citing in citing_recids:
        citing_authors = get_authors_from_record(citing, tags)
        if not record_authors or len(citing_authors) > 20:
            if get_collaborations_from_record(citing, tags):
                # Collaboration record citing an author's record:
                # not a self-citation
                continue
        citing_coauthors = frozenset(authors_fun(citing, tags))
        if record_authors & citing_coauthors:
            found.add(citing)

    return found
Example #8
0
def citations_nb_counts():
    """Return the citation count of the record named by the request.

    The record id is read from the ``recid`` view argument; nothing (None)
    is returned when it is absent. The result is 0 when
    CFG_BIBRANK_SHOW_CITATION_LINKS is off. On an INSPIRE site only citing
    records found by the 'citeable' collection search are counted.
    """
    recid = request.view_args.get('recid')
    if recid is None:
        return

    from intbitset import intbitset
    from invenio.legacy.bibrank.citation_searcher import (get_cited_by,
                                                          get_cited_by_count)

    if not CFG_BIBRANK_SHOW_CITATION_LINKS:
        return 0

    if not CFG_INSPIRE_SITE:
        return get_cited_by_count(recid)

    from invenio.legacy.search_engine import search_unit
    citing = intbitset(get_cited_by(recid))
    citeable = search_unit(p='citeable', f='collection')
    return len(citing & citeable)
Example #9
0
def citations_nb_counts():
    """Count the citations of the record whose id is in the request.

    Returns None when the request carries no ``recid`` view argument, 0 when
    CFG_BIBRANK_SHOW_CITATION_LINKS is disabled, and otherwise the number of
    citing records (restricted to the 'citeable' collection search result
    when CFG_INSPIRE_SITE is set).
    """
    recid = request.view_args.get("recid")
    if recid is None:
        return

    from intbitset import intbitset
    from invenio.legacy.bibrank.citation_searcher import get_cited_by, get_cited_by_count

    if not CFG_BIBRANK_SHOW_CITATION_LINKS:
        return 0

    if CFG_INSPIRE_SITE:
        from invenio.legacy.search_engine import search_unit

        citing = intbitset(get_cited_by(recid))
        citeable = search_unit(p="citeable", f="collection")
        return len(citing & citeable)

    return get_cited_by_count(recid)
Example #10
0
    def related_records(recids, recids_processed):
        """Add related records to RECIDS, then remove those already processed.

        RECIDS and RECIDS_PROCESSED are both modified in place; RECIDS is
        also returned. ``fmt`` and ``latest_bibrank_run`` are read from the
        enclosing scope.
        """
        if fmt == "HDREF" and recids:
            # The references tab (HDREF) depends on the record and on its
            # citations, so a change in either one requires regenerating it.
            joined_ids = ','.join(str(r) for r in recids)
            sql = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % joined_ids

            selected = []
            for rec_id, modified in run_sql(sql):
                # Dates are compared as formatted strings.
                stamp = modified.strftime("%Y-%m-%d %H:%M:%S")
                if stamp < latest_bibrank_run:
                    selected.append(rec_id)

            for rec_id in intbitset(selected):
                recids |= intbitset(get_cited_by(rec_id))

        # Skip whatever has been processed before
        recids -= recids_processed
        # and record the remaining ids as processed
        recids_processed += recids

        return recids
Example #11
0
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    Collects the record IDs to reformat from RECIDS, the CDS query and the
    SQL queries, formats them, optionally stores the run date and finally
    writes timing statistics with write_message().

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases (for selecting records). Some of these queries will be picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select records to reformat.
    @param cds_query: a search query to be executed to select records to reformat
        (dictionary read through its 'field', 'collection', 'pattern' and
        'matching' keys)
    @param process_format: when true, the records returned by without_fmt(sql)
        are selected as well ('-without' parameter)
    @param process: when true, the selected records are actually formatted;
        otherwise they are only counted in the messages
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]  # elapsed real time, used for the final statistics

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field']      != "" or  \
       cds_query['collection'] != "" or  \
       cds_query['pattern']    != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None, of='id',
                                         c=cds_query['collection'],
                                         p=cds_query['pattern'],
                                         f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        # Separate name so that the ``sql`` parameter is not shadowed.
        hdref_sql = """SELECT id, modification_date FROM bibrec
                 WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        # NOTE(review): mod_date is compared directly with latest_bibrank_run;
        # confirm that both values have the same type.
        recIDs = intbitset([recid for recid, mod_date in run_sql(hdref_sql)
                            if mod_date < latest_bibrank_run])
        # Iterate over a snapshot: recIDs grows inside the loop and must not
        # be modified while it is being iterated.
        for r in list(recIDs):
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs) \
                                               + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec   = 0     # Total number of records
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this
                                            # when migration from php to
                                            # python bibformat is done
            (total_rec_1, tbibformat_1, tbibupload_1) = iterate_over_old(recIDs,
                                                                         fmt)
        else:
            (total_rec_1, tbibformat_1, tbibupload_1) = iterate_over_new(recIDs,
                                                                         fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

    ### Iterate over all records prepared in list II (no_format)
    if process_format and process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this
                                            # when migration from php to
                                            # python bibformat is done
            (total_rec_2, tbibformat_2, tbibupload_2) = iterate_over_old(without_format,
                                                                         fmt)
        else:
            (total_rec_2, tbibformat_2, tbibupload_2) = iterate_over_new(without_format,
                                                                         fmt)
        total_rec += total_rec_2
        tbibformat += tbibformat_2
        tbibupload += tbibupload_2

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)