def generate_list_to_send(search):
    '''
    Generate a list to send to MSNET.
    '''

    filename = 'tmp_' + __file__
    filename = re.sub(r'\.py$', '_send.txt', filename)
    output = open(filename,'w')

    recids_nomatch = find_recids_nomatch()

    print search
    result_m = perform_request_search(p=search, cc='HEP')
    print search, len(result_m)
    search = "035__9:msnet"
    result_i = perform_request_search(p=search, cc='HEP')
    search = "0247_2:doi"
    result_d = perform_request_search(p=search, cc='HEP')
    result = intbitset(result_m) & intbitset(result_d) - intbitset(result_i)
    result = result - intbitset(recids_nomatch)
    for recid in result:
        try:
            doi = get_fieldvalues(recid, '0247_a')[0]
        except IndexError:
            # record has no DOI value; report it and move on
            print 'Problem with:', recid
            continue
        output.write(str(recid) + ',' + doi + '\n')
    output.close()
    print filename
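# Hedged usage sketch for the function above; the search pattern is a
# hypothetical example, not the script's real query:
#
#     generate_list_to_send('035__9:zblatt')
#
# The resulting tmp_<script>_send.txt file holds one "recid,doi" pair per
# line, ready to be sent to MSNET.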
def main():
    counter = 0
    filename = 'ADS_eprints_missing_in_INSPIRE.csv'
    mismatch_filename = 'ADS_eprints_missing_in_INSPIRE_mismatch.csv'
    output = open(filename, 'w')
    mismatch_output = open(mismatch_filename, 'w')
    records = collections.defaultdict(dict)
    search = '0247_2:doi -037__9:arxiv'
    results = perform_request_search(p=search, cc='HEP')
    for r in results:
        doi = get_fieldvalues(r, '0247_a')
        if doi:
            records[r]['doi'] = doi
    eprints = []
    eprint_search = perform_request_search(p='037__9:arxiv', cc='HEP')
    for e in eprint_search:
        eprint = get_eprint_id(e)
        if eprint:
            eprint = eprint.replace('arxiv:', '')
            eprints.append(eprint)
    tree = ET.parse(DOCUMENT)
    root = tree.getroot()
    for child in root:
        if counter < 10:
            if 'doi' in child.attrib and 'preprint_id' in child.attrib:
                found_eprint = check_doi(child.attrib, records, eprints)
                if found_eprint:
                    if found_eprint[0] is True:
                        counter+=1
                        output.write('%s,%s,%s\n' % (found_eprint[0], found_eprint[1], found_eprint[2]))
                    else:
                        mismatch_output.write('%s,%s,%s\n' % (found_eprint[0], found_eprint[1], found_eprint[2]))
    output.close()
    print counter
def calculate_index(author):
    '''Calculate the author's h-index and g-index.'''

    search = "find ea " + author
    result = perform_request_search(p=search, cc='HEP')
    if len(result) == 0:
        print author, 'has no papers.'
        return None
    citation_list = []
    for recid in result:
        search = 'refersto:recid:' + str(recid)
        citation_list.append(len(perform_request_search(p=search, cc='HEP')))
    citation_list.sort(reverse=True)
    total_citations = 0
    h_index = False
    g_index = 0
    for index, value in enumerate(citation_list, 1):
        total_citations += value
        #print '{0:3d} {1:6d} {2:6d} {3:6d}'.format(index, value,
        #                                           total_citations,
        #                                           index*index)
        if index > value and h_index is False:
            h_index = index - 1
        # g-index: largest rank whose cumulative citations reach rank**2
        if total_citations >= index*index:
            g_index = index
    if h_index is False:
        # every paper has at least as many citations as its rank
        h_index = len(citation_list)
    print '{0:20s} {1:7d} {2:7d}'.format(author, h_index, g_index)
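# A minimal, Invenio-free sketch of the same index logic, runnable on a
# made-up citation list (the values are illustrative only):
def _indexes(citations):
    citations = sorted(citations, reverse=True)
    total = 0
    h_index = g_index = 0
    for rank, cites in enumerate(citations, 1):
        total += cites
        if cites >= rank:
            h_index = rank
        if total >= rank * rank:
            g_index = rank
    return h_index, g_index

# _indexes([10, 8, 5, 4, 3]) == (4, 5): four papers have >= 4 citations
# each, and the top five together have 30 >= 5*5 citations.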
def main(search):
    """This module returns a Google-like result showing the most
       highly cited papers from a given result."""

    all_refs = []
    if not search:
        # fall back to a sample search; other examples tried here were
        # 'standard model' and '"dark matter"'
        search = 'qcd sum rules'
    print 'Your search is', search
    result = perform_request_search(p=search, cc='HEP')
    print 'The result is', len(result)
    for recid in result:
        try:
            search = 'citedby:recid:' + str(recid)
            refs = perform_request_search(p=search, cc='HEP')
            all_refs += refs
        except Exception:
            print 'problem with', recid
    all_refs.sort()
    counted_all_refs = Counter(all_refs)
    sorted_count = sorted(counted_all_refs.items(), key=operator.itemgetter(1))
    for recid_count, count in sorted_count[-10:]:
        url = 'http://inspirehep.net/record/' + str(recid_count)
        print count, url
        try:
            title = get_fieldvalues(recid_count, '245__a')[0]
        except IndexError:
            title = 'No Title'
        try:
            author = get_fieldvalues(recid_count, '710__g')[0]
        except IndexError:
            try:
                author = get_fieldvalues(recid_count, '100__a')[0]
            except IndexError:
                author = 'No Author'
        print '  ', author, ':', title
def eprints():
    total = 0
    fermilab = get_collection_reclist('Fermilab')
    print '{0:4s} {1:3s} {2:3s} {3:5s}'.format('Date', 'All', 'FNA', '%')
    #date_range = ['1901', '1902', '1903']
    date_range = range(1, 20)
    for yymm in date_range:
        yymm = str(yymm)
        if len(yymm) == 1:
            yymm = '0' + yymm
        #search_f = '037__a:fermilab* 037__c:physics.acc-ph 037__a:"arXiv:' + \
        #           yymm + '*"'
        search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"'
        x = perform_request_search(p=search, cc='HEP')
        search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"'
        y = perform_request_search(p=search, cc='HEP')
        x_f = intbitset(x) & fermilab
        y_f = intbitset(y) & fermilab
        length = len(x) + len(y)
        length_f = len(x_f) + len(y_f)
        try:
            ratio = float(length_f)/float(length)*100.0
        except ZeroDivisionError:
            ratio = 0
        print '{0:4s} {1:3d} {2:3d} {3:5.1f}'.format(yymm, length, length_f,
                                                     ratio)
        total += length
    print "Total =", total
def _get_coauthors_fallback(personid, collabs):
    # python 2.4 does not support max() with key argument.
    # Please remove this function when python 2.6 is supported.
    def max_key(iterable, key):
        try:
            ret = iterable[0]
        except IndexError:
            return None
        for i in iterable[1:]:
            if key(i) > key(ret):
                ret = i
        return ret

    if collabs:
        query = 'exactauthor:"%s" and (%s)' % (personid, ' or '.join([('collaboration:"%s"' % x) for x in zip(*collabs)[0]]))
        exclude_recs = perform_request_search(rg=0, p=query)
    else:
        exclude_recs = []

    recids = perform_request_search(rg=0, p='exactauthor:"%s"' % str(personid))
    recids = list(set(recids) - set(exclude_recs))
    a = format_records(recids, 'WAPAFF')
    a = [pickle.loads(p) for p in a.split('!---THEDELIMITER---!') if p]
    coauthors = {}
    for rec, affs in a:
        keys = affs.keys()
        for n in keys:
            try:
                coauthors[n].add(rec)
            except KeyError:
                coauthors[n] = set([rec])

    coauthors = [(x, x, len(coauthors[x])) for x in coauthors if x.lower() != personid.lower()]
    return coauthors
def _get_hepnames_data_fallback(bibauthorid_data, person_id):
    '''
    Returns hepnames data.

    @param bibauthorid_data: dict with 'is_baid':bool, 'cid':canonicalID, 'pid':personid
    '''
    cid = str(person_id)
    hepdict = {}
    if bibauthorid_data['cid']:
        cid = bibauthorid_data['cid']
    hepRecord = perform_request_search(rg=0, cc='HepNames', p=cid)[:CFG_WEBAUTHORPROFILE_MAX_HEP_CHOICES]

    hepdict['cid'] = cid
    hepdict['pid'] = person_id

    if not hepRecord or len(hepRecord) > 1:
        #present choice dialog with alternatives?
        names_dict = get_person_names_dicts(person_id)
        dbnames = names_dict[0]['db_names_dict'].keys()
        query = ' or '.join(['"%s"' % str(n) for n in dbnames])
        additional_records = perform_request_search(rg=0, cc='HepNames', p=query)[:CFG_WEBAUTHORPROFILE_MAX_HEP_CHOICES]
        hepRecord += additional_records
        hepdict['HaveHep'] = False
        hepdict['HaveChoices'] = bool(hepRecord)
        #limits possible choices!
        hepdict['HepChoices'] = [(format_record(x, 'hb'), x) for x in hepRecord]
        hepdict['heprecord'] = hepRecord
        hepdict['bd'] = bibauthorid_data
    else:
        #show the heprecord we just found.
        hepdict['HaveHep'] = True
        hepdict['HaveChoices'] = False
        hepdict['heprecord'] = format_record(hepRecord[0], 'hd')
        hepdict['bd'] = bibauthorid_data
    return hepdict
def find_records():
    '''Looks for candidate records.'''
    search = "find fc g not fc m not fc t and tc p and jy " + str(YEAR)
    result_m = perform_request_search(p=search, cc='HEP')
    search = "035__9:msnet"
    result_i = perform_request_search(p=search, cc='HEP')
    result = intbitset(result_m) - intbitset(result_i)
    return result
def get_institution_ids(text):
    # HACK: I know... I am sorry for that. It's for a good cause
    # FIXME: use redis
    global INSTITUTION_CACHE
    if text not in INSTITUTION_CACHE:
        INSTITUTION_CACHE[text] = intbitset(perform_request_search(cc='Institutions', p='110__u:"%s"' % text)) or \
            intbitset(perform_request_search(cc='Institutions', p='110__t:"%s"' % text))
    return INSTITUTION_CACHE[text]
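# INSTITUTION_CACHE is assumed to be a module-level dict initialised once,
# e.g. INSTITUTION_CACHE = {}. Note the `or` above: the 110__t search only
# runs when the 110__u search returns an empty intbitset.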
def goto(type, document='', number=0, lang='en', modif=0):
    today = time.strftime('%Y-%m-%d')
    if type == 'SSR':
        ## We would like a CERN Staff Rules and Regulations
        recids = perform_request_search(cc='Staff Rules and Regulations', f="925__a:1996-01-01->%s 925__b:%s->9999-99-99" % (today, today))
        recid = recids[-1]
        reportnumber = get_fieldvalues(recid, '037__a')[0]
        edition = int(reportnumber[-2:]) ## e.g. CERN-STAFF-RULES-ED08
        return BibRecDocs(recid).get_bibdoc(make_cern_ssr_docname(lang, edition, modif)).get_file('.pdf').get_url()
    elif type == "OPER-CIRC":
        recids = perform_request_search(cc="Operational Circulars", p="reportnumber=\"CERN-OPER-CIRC-%s-*\"" % number, sf="925__a")
        recid = recids[-1]
        documents = {}
        bibrecdocs = BibRecDocs(recid)
        for docname in bibrecdocs.get_bibdoc_names():
            ldocname = docname.lower()
            if 'implementation' in ldocname:
                _register_document(documents, docname, 'implementation_en')
            elif 'application' in ldocname:
                _register_document(documents, docname, 'implementation_fr')
            elif 'archiving' in ldocname:
                _register_document(documents, docname, 'archiving_en')
            elif 'archivage' in ldocname:
                _register_document(documents, docname, 'archiving_fr')
            elif 'annexes_en' in ldocname:
                # test the explicit English marker first: 'annexes_en' also
                # contains the French substring 'annexe'
                _register_document(documents, docname, 'annex_en')
            elif 'annexe' in ldocname or 'annexes_fr' in ldocname:
                _register_document(documents, docname, 'annex_fr')
            elif 'annex' in ldocname:
                _register_document(documents, docname, 'annex_en')
            elif '_en_' in ldocname or '_eng_' in ldocname or '_angl_' in ldocname:
                _register_document(documents, docname, 'en')
            elif '_fr_' in ldocname:
                _register_document(documents, docname, 'fr')
        return bibrecdocs.get_bibdoc(documents[document]).get_file('.pdf').get_url()
    elif type == 'ADMIN-CIRC':
        recids = perform_request_search(cc="Administrative Circulars", p="reportnumber=\"CERN-ADMIN-CIRC-%s-*\"" % number, sf="925__a")
        recid = recids[-1]
        documents = {}
        bibrecdocs = BibRecDocs(recid)
        for docname in bibrecdocs.get_bibdoc_names():
            ldocname = docname.lower()
            if 'implementation' in ldocname:
                _register_document(documents, docname, 'implementation-en')
            elif 'application' in ldocname:
                _register_document(documents, docname, 'implementation-fr')
            elif 'archiving' in ldocname:
                _register_document(documents, docname, 'archiving-en')
            elif 'archivage' in ldocname:
                _register_document(documents, docname, 'archiving-fr')
            elif 'annexes_en' in ldocname:
                # English marker first: 'annexes_en' also contains 'annexe'
                _register_document(documents, docname, 'annex-en')
            elif 'annexe' in ldocname or 'annexes_fr' in ldocname:
                _register_document(documents, docname, 'annex-fr')
            elif 'annex' in ldocname:
                _register_document(documents, docname, 'annex-en')
            elif '_en_' in ldocname or '_eng_' in ldocname or '_angl_' in ldocname:
                _register_document(documents, docname, 'en')
            elif '_fr_' in ldocname:
                _register_document(documents, docname, 'fr')
        return bibrecdocs.get_bibdoc(documents[document]).get_file('.pdf').get_url()
    def test_fin_to_find_trans(self):
        """SPIRES search syntax - fin a ellis, j == find a ellis, j"""
        fin_search = "fin a ellis, j"
        fin_result = perform_request_search(p=fin_search)
        find_search = "find a ellis, j"
        find_result = perform_request_search(p=find_search)
        # We don't care if results are [], as long as they're the same
        # Uncovered corner case: parsing could be broken and also happen to
        # return [] twice.  Unlikely though.
        self.assertEqual(fin_result, find_result)
def get_kbd_values(kbname, searchwith=""):
    """Return a list of values by searching a dynamic kb.

    @param kbname:     name of the knowledge base
    @param searchwith: a term to search with
    """
    from invenio import search_engine

    #first check that the kb in question is dynamic
    kbid = bibknowledge_dblayer.get_kb_id(kbname)
    if not kbid:
        return []
    kbtype = bibknowledge_dblayer.get_kb_type(kbid)
    if not kbtype:
        return []
    if kbtype != 'd':
        return []
    #get the configuration so that we see what the field is
    confdict = bibknowledge_dblayer.get_kb_dyn_config(kbid)
    if not confdict:
        return []
    if 'field' not in confdict:
        return []
    field = confdict['field']
    expression = confdict['expression']
    collection = ""
    if 'collection' in confdict:
        collection = confdict['collection']
    reclist = []  # return this
    if searchwith and expression:
        if '%' in expression:
            expression = expression.replace("%", searchwith)
        else:
            #no % placeholder.. just AND the searchwith term onto it
            expression = expression + " and " + searchwith
        reclist = search_engine.perform_request_search(p=expression,
                                                       cc=collection)
    else:  # either no expression or no searchwith
        if expression:  # in this case: only expression
            reclist = search_engine.perform_request_search(p=expression,
                                                           cc=collection)
        else:
            #make a fake expression so that only records that have this field
            #will be returned
            fake_exp = "/.*/"
            if searchwith:
                fake_exp = searchwith
            reclist = search_engine.perform_request_search(f=field, p=fake_exp,
                                                           cc=collection)
    if reclist:
        return [val for (val, dummy) in \
            search_engine.get_most_popular_field_values(reclist, field)]
    return []  # in case nothing worked
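# Hedged usage sketch; the kb name and search term are hypothetical:
#
#     values = get_kbd_values('journal_names', searchwith='Phys')
#
# For a dynamic kb whose configured expression contains '%', the term is
# substituted in place of '%'; otherwise it is ANDed onto the expression.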
    def test_irn_processing(self):
        """SPIRES search syntax - find irn 1360337 == find irn SPIRES-1360337"""
        # Added for trac-130
        with_spires = "fin irn SPIRES-1360337"
        with_result = perform_request_search(p=with_spires)
        without_spires = "fin irn 1360337"
        without_result = perform_request_search(p=without_spires)
        # We don't care if results are [], as long as they're the same
        # Uncovered corner case: parsing could be broken and also happen to
        # return [] twice.  Unlikely though.
        self.assertEqual(with_result, without_result)
    def match(self, query=None, **kwargs):
        """Try to match the current record to the database."""
        from invenio.search_engine import perform_request_search
        if not query:
            # We use default setup
            recid = self.record["001"][0][3]
            return perform_request_search(p="035:%s" % (recid,),
                                          of="id")
        else:
            if "recid" not in kwargs:
                kwargs["recid"] = self.record["001"][0][3]
            return perform_request_search(p=query % kwargs,
                                          of="id")
def job_stats():
    grand_total = 0
    print "{0:20s} {1:5s} {2:5s} {3:5s} {4:5s}".format('search', 'open',
                                                       'closed', 'total',
                                                       'g_tot')
    for month in range(1, 4):
        month = '%02d' % month
        search = 'dadd:2019-' + month
        x = perform_request_search(p=search, cc='Jobs')
        y = perform_request_search(p=search, cc='Jobs Hidden')
        total = len(x+y)
        grand_total += total
        print "{0:20s} {1:5d} {2:5d} {3:5d} {4:5d}".format(search,
            len(x), len(y), total, grand_total)
def get_reference_number(tarball):
    '''
    Attempts to determine the reference number of the file by searching.

    @param: tarball (string): the name of the tarball as downloaded from
        arXiv

    @return: refno (string): the reference number of the paper
    '''

    # we just need the name of the file
    tarball = os.path.split(tarball)[1]

    # the name right now looks like arXiv:hep-ph_9703009
    # or arXiv:0910.0476
    if tarball.startswith(ARXIV_HEADER):
        tarball = tarball.split(':')[1]
        if len(tarball.split('_')) > 1:
            arXiv_record = tarball.replace('_', '/')
        else:
            arXiv_record = tarball

        result = perform_request_search(p=arXiv_record, f='reportnumber')

        if len(result) == 0:
            return tarball

        return str(result[0])

    arXiv_record = re.findall('(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))', tarball)
    if arXiv_record:
        # findall returns a tuple per match; group 0 is the full identifier
        arXiv_record = arXiv_record[0][0]
        result = perform_request_search(p=arXiv_record, f='reportnumber')

        if len(result) > 0:
            return str(result[0])

    tarball_mod = tarball.replace('_', '/')
    arXiv_record = re.findall('(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))',\
                              tarball_mod)
    if arXiv_record:
        arXiv_record = arXiv_record[0][0]
        result = perform_request_search(p=arXiv_record, f='reportnumber')

        if len(result) > 0:
            return str(result[0])

    return tarball
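# A quick, runnable check of the identifier regex used above (the file
# names are made up):
#
#     import re
#     pattern = '(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))'
#     re.findall(pattern, 'hep-ph/9703009')[0][0]  # -> 'hep-ph/9703009'
#     re.findall(pattern, '0910.0476')[0][0]       # -> '0910.0476'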
def get_list():
    papers = []
    prev_version = perform_request_search()

    for recid in prev_version:
        rec = get_record(recid)
        doi = None
        arxiv_id = None
        try:
            if ('2', 'DOI') in rec['024'][0][0]:
                for t in rec['024'][0][0]:
                    if 'a' in t:
                        doi = t[1]
                if not doi:
                    print "No DOI for record: %i" % (recid, )
            else:
                print "No DOI for record: %i" % (recid, )
        except (KeyError, IndexError):
            print "No DOI for record: %i" % (recid, )

        checksum, url, url_type = get_pdf(recid)

        if '037' in rec:
            if ('9', 'arXiv') in rec.get('037')[0][0]:
                for t in rec.get('037')[0][0]:
                    if 'a' in t:
                        arxiv_id = t[1]

        papers.append((recid, arxiv_id, get_creation_date(recid), checksum, url, url_type, doi))
    return papers
def find_records_containing_email():
    """
    Searches for HEP records with emails
    """

    #emails = set()
    #recids = set()
    #for email in EMAILS_HEP:
    #    if email not in EMAILS_HEPNAMES:
    #        continue
    #    if email.startswith('email'):
    #        continue
    #    emails.add(email)
    #    search = "100__m:{0} or 700__m:{0}".format(email)
    #    result = perform_request_search(p=search, cc='HEP')
    #    if len(result) > 1:
    #        recids.update(result)
    #print recids
    #quit()

    #search = r'100__m:/\@/ or 700__m:/\@/ \
    #            - \
    #           100__m:email* - 700__m:email*'
    search = r'100__m:/\w/ or 700__m:/\w/'
    if SEARCH:
        search = SEARCH
    result = perform_request_search(p=search, cc='HEP')
    print "Checking", len(result), "records"
    return sorted(result, reverse=True)
def get_matched_id(subfields):
    citation_element = reference2citation_element(subfields)
    if "doi_string" in citation_element:
        recids = find_doi(citation_element)
        if len(recids) == 1:
            return recids.pop()
    if "journal_title" in citation_element and "year" in citation_element:
        recids = find_journal(citation_element)
        if len(recids) == 1:
            return recids.pop()
    if "pubnote" in citation_element:
        recids = perform_request_search(p=citation_element["pubnote"], f="journal")
        if len(recids) == 1:
            return recids.pop()
    if "report_num" in citation_element:
        recids = find_reportnumber(citation_element)
        if len(recids) == 1:
            return recids.pop()
    if "ISBN" in citation_element:
        recids = find_isbn(citation_element)
        if len(recids) == 1:
            return recids.pop()
    # if 'title' in citation_element:
    #     recids = find_book(citation_element)
    #     if len(recids) == 1:
    #         return recids.pop()
    return None
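# The same "first field that yields exactly one match wins" cascade can be
# written data-driven; a sketch reusing the finder helpers referenced above
# (the journal and pubnote steps, which carry extra conditions, are omitted
# for brevity):
def _get_matched_id_sketch(subfields):
    citation_element = reference2citation_element(subfields)
    finders = (('doi_string', find_doi),
               ('report_num', find_reportnumber),
               ('ISBN', find_isbn))
    for key, finder in finders:
        if key in citation_element:
            recids = finder(citation_element)
            if len(recids) == 1:
                return recids.pop()
    return None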
def _submit_changes_to_bibupload(search_criteria, update_commands, upload_mode, tag_list, collection, req):
    """This methods takes care of submitting the changes to the server
    through bibupload.

    @param search_criteria: the search criteria used for filtering the
    records. The changes will be applied to all the records matching
    the criteria

    @param update_commands: the commands defining the changes. These
    commands perform the necessary changes before the records are submitted
    """
    if collection == "Any collection":
        collection = ""
    record_IDs = search_engine.perform_request_search(p=search_criteria, c=collection)
    num_records = len(record_IDs)

    updated_records = []

    for current_id in record_IDs:
        current_updated_record = _get_updated_record(current_id, update_commands)
        updated_records.append(current_updated_record)

    file_path = _get_file_path_for_bibupload()
    _save_records_xml(updated_records, file_path, upload_mode, tag_list)
    return _upload_file_with_bibupload(file_path, upload_mode, num_records, req)
def run(query_file):

    fi = open(query_file, 'r')
    queries = filter(len, map(lambda x: x.strip(), fi.readlines()))
    fi.close()

    success = failure = error = 0
    for q in queries:
        print '---'
        print q
        inv_res = len(search_engine.perform_request_search(None, p=q))
        msg = 'NO'
        inv_query = '\t\t'
        try:
            (solr_res, inv_query) = ask_solr(q)
        except Exception, e:
            solr_res = None
            #print e
            msg = 'ER'
            error += 1
            failure -= 1
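            # note: the result comparison below increments `failure` for
            # this query too, so the pre-decrement nets out to error += 1 only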

        print inv_query
        if inv_res == solr_res:
            success += 1
            msg = 'OK'
        else:
            failure += 1


        print "%s  invenio=%s  montysolr=%s" % (msg, inv_res, solr_res)
    def test_get_record(self):
        for recid in perform_request_search(p=""):
            # Our bibrecord we want to test
            record = self.records_cache[recid]
            # Reference implementation
            original_record = get_record_original(recid)
            self.assertXmlEqual(record.to_xml(), print_rec(original_record))
def perform_candidate_record_search(requestType, data):
    """Handle search requests.
    """
    max_results = 999
    too_many = False
    result = {
        'resultCode': 0,
        'resultText': ''
        }
    if requestType == "searchCandidates":
        recids = perform_request_search( p=data['query'] )
        if len(recids) > max_results:
            too_many = True
        else:
            captions = [ search_result_info(x) for x in recids ]
            alternative_titles = [ remove_html_markup(print_record(x, "hs")) for x in recids ]
            search_results = [recids, captions, alternative_titles]
    elif requestType == "searchRevisions":
        revisions = get_record_revision_ids( data['recID1'] )
        captions = [ split_revid(x, 'datetext')[1] for x in revisions ]
        search_results = [revisions, captions]

    if too_many:
        result['resultCode'] = 1
        result['resultText'] = 'Too many results'
    else:
        result['results'] = search_results
        result['resultText'] = '%s results' % len(search_results[0])

    return result
def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    recids = intbitset()

    for set_def in get_set_definitions(set_spec):
        new_recids = perform_request_search(c=[coll.strip() \
                                               for coll in set_def['c'].split(',')],
                                            p1=set_def['p1'],
                                            f1=set_def['f1'],
                                            m1=set_def['m1'],
                                            op1=set_def['op1'],
                                            p2=set_def['p2'],
                                            f2=set_def['f2'],
                                            m2=set_def['m2'],
                                            op2=set_def['op2'],
                                            p3=set_def['p3'],
                                            f3=set_def['f3'],
                                            m3=set_def['m3'],
                                            ap=0)

        recids |= intbitset(new_recids)

    return recids
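# Each set definition is assumed to be a dict carrying the keys used above,
# e.g. (values purely illustrative):
#
#     {'c': 'Preprints,Theses', 'p1': 'cern', 'f1': 'division', 'm1': 'e',
#      'op1': 'a', 'p2': '', 'f2': '', 'm2': '', 'op2': '',
#      'p3': '', 'f3': '', 'm3': ''}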
def fetch_xml_files(folder, els, new_files):
    """Recursively gets the downloaded xml files
    converts them to marc xml format and stores them
    in the same directory with the name "upload.xml"."""
    if exists(folder):
        for subfolder in listdir(folder):
            subfolder = join(folder, subfolder).lstrip()
            if isfile(subfolder):
                if not subfolder.endswith('upload.xml'):
                    folders = subfolder.split('/')
                    folders[-1] = 'upload.xml'
                    file_loc = "/".join(folders)
                    if not exists(file_loc):
                        xmlFile = open(subfolder, "r")
                        xmlString = xmlFile.read()
                        xmlFile.close()
                        dom_xml = xml.dom.minidom.parseString(xmlString)
                        doi = els.get_publication_information(dom_xml)[-1]
                        write_message("DOI in record: %s" % (doi,))
                        res = perform_request_search(p="doi:%s" % (doi,),
                                                     of="id")
                        if not res:
                            write_message("DOI not found")
                            doctype = els.get_doctype(dom_xml).lower()
                            #ignore index pages
                            if doctype in INTERESTING_DOCTYPES:
                                marcfile = open(file_loc, 'w')
                                marcfile.write(els.get_record(subfolder))
                                marcfile.close()
                                new_files.append(file_loc)
                                task_sleep_now_if_required(can_stop_too=False)
                        else:
                            write_message("DOI found: %s" % (res,))
            else:
                fetch_xml_files(subfolder, els, new_files)
def build_issns_from_local_site():
    """
    Retrieves the ISSNs from the local database.
    Store the "journal name -> issn" relation.

    Normalize journal names a little bit:
        - strip whitespace chars (left and right)
        - all lower case
        - remove "[Online]" suffix

    Print the result as Python dict structure.
    """

    rec_id_list = perform_request_search(cc='Periodicals',
                                         of='id')
    built_issns = {}
    #built_issns = issns # Uncomment this to extend existing issns dict
                         # (e.g. in case of manual addition)
    for rec_id in rec_id_list:
        journal_name_list = get_fieldvalues(rec_id, '210__%')
        issn_list = get_fieldvalues(rec_id, '022__a')
        if issn_list:
            issn = issn_list[0] # There should be only one ISSN
            for journal_name in journal_name_list:
                # Depending on how journal names are entered into the database,
                # you might want to do some processing before saving:
                journal_name = journal_name.lower().strip()
                if journal_name.endswith("[online]"):
                    journal_name = journal_name[:-8].rstrip()

                built_issns[journal_name] = issn

    prtyp = pprint.PrettyPrinter(indent=4)
    prtyp.pprint(built_issns)
def oaigetsysnolist(set="", fromdate="", untildate=""):
    "Returns list of system numbers for the OAI set 'set', modified from 'fromdate' until 'untildate'."
    from invenio.oai_repository_updater import get_set_definitions

    if fromdate != "":
        fromdate = normalize_date(fromdate, "T00:00:00Z")
    else:
        fromdate = get_earliest_datestamp()

    if untildate != "":
        untildate = normalize_date(untildate, "T23:59:59Z")
    else:
        untildate = get_latest_datestamp()

    collections = []
    for set_definition in get_set_definitions(set):
        collections.extend(coll.strip() for coll in set_definition['c'].split(','))
    recids = perform_request_search(f1=CFG_OAI_ID_FIELD, p1="oai:*", m1="e", op1='a',
                                    f2=((set and CFG_OAI_SET_FIELD) or ""), p2=set, m2="e",
                                    d1=utc_to_localtime(fromdate),
                                    d2=utc_to_localtime(untildate),
                                    c=collections,
                                    dt='m',
                                    ap=0)
    ## Let's discard non public records
    return list(intbitset(recids) - get_all_restricted_recids())
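# Hedged usage sketch; the set spec and date are hypothetical:
#
#     sysnos = oaigetsysnolist(set='cern:theses', fromdate='2002-01-01')
#
# Dates are normalised to UTC day bounds before the search, and records in
# restricted collections are discarded from the result.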
def get_hepnames_recid_from_email(email):
    """
    Find the HEPNames recid based on email
    """
   
    if email not in EMAILS_HEPNAMES:
        if VERBOSE:
            print "WARNING: no hepnames record found for %s: " % (email)
        return None

    emailsearch = '371__m:%s or 371__o:%s'
    reclist = perform_request_search(p=emailsearch % (email, email),
                                     cc='HepNames')
    hidden_m = search_unit(email, f='595__m', m='a')
    hidden_o = search_unit(email, f='595__o', m='a')
    # parenthesise: `&` binds tighter than `or`, so without parentheses the
    # intersection with HN would only apply to hidden_o
    reclist_hidden = (hidden_m or hidden_o) & HN
    reclist = intbitset(reclist) or reclist_hidden

    if len(reclist) == 1:
        return reclist[0]
    elif len(reclist) > 1:
        if VERBOSE:
            print "WARNING: more than one hepnames record found for %s: " \
                  % (email)
            print '\t' + ', '.join([str(r) for r in reclist])
        return [r for r in reclist]
    else:
        if VERBOSE:
            print "WARNING: no hepnames record found for %s: " % (email)
        return None
def get_recid_from_id(id_number):
    search = '035__a:' + id_number
    result = perform_request_search(p=search, cc='HepNames')
    if len(result) == 1:
        return result[0]
    else:
        return None
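# Hedged usage sketch; the identifier is hypothetical:
#
#     recid = get_recid_from_id('INSPIRE-00123456')
#
# Returns the single matching HepNames recid, or None when zero or several
# records match.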
def check_records(records, amend_case=False):
    for record in records:
        for position, value in record.iterfields(['100__a', '700__a']):
            value = value.decode('utf8')
            new_value = NAME_CACHE.get(value)
            if new_value is None:
                search_value = value
                if ',' in value:
                    splitted_values = search_value.split(',', 1)
                    search_value = u"%s %s" % (splitted_values[1].strip(), splitted_values[0].strip())
                original_family_name = value.split(',')[0].strip()
                search_value = RE_SPACES.sub(' ', search_value).strip()
                if len(search_value.split()) < 3:
                    # Simple name
                    continue
                i = perform_request_search(p=u'author:"%s"' % search_value, cc='HepNames')
                possible_values = get_fieldvalues(i, '100__a', sort=False) + get_fieldvalues(i, '400__a', sort=False)
                for correct_value in possible_values:
                    correct_value = correct_value.decode('utf8')
                    if search_value.lower().endswith(" " + correct_value.lower().split(',')[0]):
                        family_name = correct_value.split(',')[0].strip()
                        if len(family_name) < len(original_family_name):
                            continue
                        first_name = search_value[:-(len(family_name) + 1)].strip()
                        new_value = u'%s, %s' % (family_name, first_name)
                        NAME_CACHE[value] = new_value
                        break
                else:
                    NAME_CACHE[value] = value
            if new_value:
                if amend_case and new_value == value:
                    continue
                elif new_value.lower() == value.lower():
                    continue
                record.amend_field(position, new_value.encode('utf8'))
    def __call__(self, req, form):
        argd = wash_search_urlargd(form)

        argd['recid'] = self.recid

        argd['tab'] = self.tab

        if self.format is not None:
            argd['of'] = self.format
        req.argd = argd
        uid = getUid(req)
        if uid == -1:
            return page_not_authorized(
                req,
                "../",
                text="You are not authorized to view this record.",
                navmenuid='search')
        elif uid > 0:
            pref = get_user_preferences(uid)
            try:
                if not form.has_key('rg'):
                    # fetch user rg preference only if not overridden via URL
                    argd['rg'] = int(pref['websearch_group_records'])
            except (KeyError, ValueError):
                pass

        user_info = collect_user_info(req)
        (auth_code,
         auth_msg) = check_user_can_view_record(user_info, self.recid)

        if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and acc_authorize_action(
                req, 'runbibedit')[0] != 0:
            argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS

        #check if the user has rights to set a high wildcard limit
        #if not, reduce the limit set by user, with the default one
        if CFG_WEBSEARCH_WILDCARD_LIMIT > 0 and (
                argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0):
            if acc_authorize_action(req, 'runbibedit')[0] != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

        # only superadmins can use verbose parameter for obtaining debug information
        if not isUserSuperAdmin(user_info):
            argd['verbose'] = 0

        if auth_code and user_info['email'] == 'guest':
            cookie = mail_cookie_create_authorize_action(
                VIEWRESTRCOLL, {
                    'collection': guess_primary_collection_of_a_record(
                        self.recid)
                })
            target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                    make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_SECURE_URL + req.unparsed_uri}, {})
            return redirect_to_url(req, target, norobot=True)
        elif auth_code:
            return page_not_authorized(req, "../", \
                text=auth_msg, \
                navmenuid='search')

        from invenio.search_engine import record_exists, get_merged_recid
        # check if the current record has been deleted
        # and has been merged, case in which the deleted record
        # will be redirect to the new one
        record_status = record_exists(argd['recid'])
        merged_recid = get_merged_recid(argd['recid'])
        if record_status == -1 and merged_recid:
            url = CFG_SITE_URL + '/' + CFG_SITE_RECORD + '/%s?ln=%s'
            url %= (str(merged_recid), argd['ln'])
            redirect_to_url(req, url)
        elif record_status == -1:
            req.status = apache.HTTP_GONE  ## The record is gone!

        # mod_python does not like to return [] in case when of=id:
        out = perform_request_search(req, **argd)
        if out == []:
            return str(out)
        else:
            return out
    def __call__(self, req, form):
        argd = wash_search_urlargd(form)
        argd['recid'] = self.recid
        if self.format is not None:
            argd['of'] = self.format

        req.argd = argd

        uid = getUid(req)
        user_info = collect_user_info(req)
        if uid == -1:
            return page_not_authorized(
                req,
                "../",
                text="You are not authorized to view this record.",
                navmenuid='search')
        elif uid > 0:
            pref = get_user_preferences(uid)
            try:
                if not form.has_key('rg'):
                    # fetch user rg preference only if not overridden via URL
                    argd['rg'] = int(pref['websearch_group_records'])
            except (KeyError, ValueError):
                pass

        if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and acc_authorize_action(
                req, 'runbibedit')[0] != 0:
            argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS

        #check if the user has rights to set a high wildcard limit
        #if not, reduce the limit set by user, with the default one
        if CFG_WEBSEARCH_WILDCARD_LIMIT > 0 and (
                argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0):
            if acc_authorize_action(req, 'runbibedit')[0] != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

        # only superadmins can use verbose parameter for obtaining debug information
        if not isUserSuperAdmin(user_info):
            argd['verbose'] = 0

        record_primary_collection = guess_primary_collection_of_a_record(
            self.recid)

        if collection_restricted_p(record_primary_collection):
            (auth_code, dummy) = acc_authorize_action(
                user_info, VIEWRESTRCOLL, collection=record_primary_collection)
            if auth_code:
                return page_not_authorized(
                    req,
                    "../",
                    text="You are not authorized to view this record.",
                    navmenuid='search')

        # Keep all the arguments, they might be reused in the
        # record page itself to derivate other queries
        req.argd = argd

        # mod_python does not like to return [] in case when of=id:
        out = perform_request_search(req, **argd)
        if out == []:
            return str(out)
        else:
            return out
def find_records_to_check():
    search = '100__m:/\@/ or 700__m:/\@/ 980:CORE'
    #search = 'find tc t and date 2000->2009'
    #search = 'find cat jkemp or cleggm1 and primarch hep-ex and ac 1'
    #search = 'find fc g or fc l or fc n  and cat luba and date 2010->2014'
    return perform_request_search(p=search, cc='HEP')
    def __call__(self, req, form):
        """ Perform a search. """
        argd = wash_search_urlargd(form)

        _ = gettext_set_language(argd['ln'])

        if req.method == 'POST':
            raise apache.SERVER_RETURN, apache.HTTP_METHOD_NOT_ALLOWED

        uid = getUid(req)
        user_info = collect_user_info(req)
        if uid == -1:
            return page_not_authorized(
                req,
                "../",
                text=_("You are not authorized to view this area."),
                navmenuid='search')
        elif uid > 0:
            pref = get_user_preferences(uid)
            try:
                if not form.has_key('rg'):
                    # fetch user rg preference only if not overridden via URL
                    argd['rg'] = int(pref['websearch_group_records'])
            except (KeyError, ValueError):
                pass

        if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and acc_authorize_action(
                req, 'runbibedit')[0] != 0:
            argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS

        involved_collections = set()
        involved_collections.update(argd['c'])
        involved_collections.add(argd['cc'])

        if argd['id'] > 0:
            argd['recid'] = argd['id']
        if argd['idb'] > 0:
            argd['recidb'] = argd['idb']
        if argd['sysno']:
            tmp_recid = find_record_from_sysno(argd['sysno'])
            if tmp_recid:
                argd['recid'] = tmp_recid
        if argd['sysnb']:
            tmp_recid = find_record_from_sysno(argd['sysnb'])
            if tmp_recid:
                argd['recidb'] = tmp_recid

        if argd['recid'] > 0:
            if argd['recidb'] > argd['recid']:
                # Hack to check if among the restricted collections
                # at least a record of the range is there and
                # then if the user is not authorized for that
                # collection.
                recids = intbitset(xrange(argd['recid'], argd['recidb']))
                restricted_collection_cache.recreate_cache_if_needed()
                for collname in restricted_collection_cache.cache:
                    (auth_code,
                     auth_msg) = acc_authorize_action(user_info,
                                                      VIEWRESTRCOLL,
                                                      collection=collname)
                    if auth_code and user_info['email'] == 'guest':
                        coll_recids = get_collection(collname).reclist
                        if coll_recids & recids:
                            cookie = mail_cookie_create_authorize_action(
                                VIEWRESTRCOLL, {'collection': collname})
                            target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                                    make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_SECURE_URL + req.unparsed_uri}, {})
                            return redirect_to_url(req, target, norobot=True)
                    elif auth_code:
                        return page_not_authorized(req, "../", \
                            text=auth_msg, \
                            navmenuid='search')
            else:
                involved_collections.add(
                    guess_primary_collection_of_a_record(argd['recid']))

        # If any of the collection requires authentication, redirect
        # to the authentication form.
        for coll in involved_collections:
            if collection_restricted_p(coll):
                (auth_code, auth_msg) = acc_authorize_action(user_info,
                                                             VIEWRESTRCOLL,
                                                             collection=coll)
                if auth_code and user_info['email'] == 'guest':
                    cookie = mail_cookie_create_authorize_action(
                        VIEWRESTRCOLL, {'collection': coll})
                    target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                            make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_SECURE_URL + req.unparsed_uri}, {})
                    return redirect_to_url(req, target, norobot=True)
                elif auth_code:
                    return page_not_authorized(req, "../", \
                        text=auth_msg, \
                        navmenuid='search')

        #check if the user has rights to set a high wildcard limit
        #if not, reduce the limit set by user, with the default one
        if CFG_WEBSEARCH_WILDCARD_LIMIT > 0 and (
                argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0):
            auth_code, auth_message = acc_authorize_action(req, 'runbibedit')
            if auth_code != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

        # only superadmins can use verbose parameter for obtaining debug information
        if not isUserSuperAdmin(user_info):
            argd['verbose'] = 0

        # Keep all the arguments, they might be reused in the
        # search_engine itself to derivate other queries
        req.argd = argd

        # mod_python does not like to return [] in case when of=id:
        out = perform_request_search(req, **argd)
        if out == []:
            return str(out)
        else:
            return out
    def __call__(self, req, form):
        """RSS 2.0 feed service."""

        # Keep only interesting parameters for the search
        default_params = websearch_templates.rss_default_urlargd
        # We need to keep 'jrec' and 'rg' here in order to have
        # 'multi-page' RSS. These parameters are not kept by default
        # as we don't want to consider them when building RSS links
        # from search and browse pages.
        default_params.update({
            'jrec': (int, 1),
            'rg': (int, CFG_WEBSEARCH_INSTANT_BROWSE_RSS)
        })
        argd = wash_urlargd(form, default_params)
        user_info = collect_user_info(req)

        for coll in argd['c'] + [argd['cc']]:
            if collection_restricted_p(coll):
                (auth_code, auth_msg) = acc_authorize_action(user_info,
                                                             VIEWRESTRCOLL,
                                                             collection=coll)
                if auth_code and user_info['email'] == 'guest':
                    cookie = mail_cookie_create_authorize_action(
                        VIEWRESTRCOLL, {'collection': coll})
                    target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                            make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_SECURE_URL + req.unparsed_uri}, {})
                    return redirect_to_url(req, target, norobot=True)
                elif auth_code:
                    return page_not_authorized(req, "../", \
                        text=auth_msg, \
                        navmenuid='search')

        # Create a standard filename with these parameters
        current_url = websearch_templates.build_rss_url(argd)
        cache_filename = current_url.split('/')[-1]

        # In the same way as previously, add 'jrec' & 'rg'

        req.content_type = "application/rss+xml"
        req.send_http_header()
        try:
            # Try to read from cache
            path = "%s/rss/%s.xml" % (CFG_CACHEDIR, cache_filename)
            # Check if cache needs refresh
            filedesc = open(path, "r")
            last_update_time = datetime.datetime.fromtimestamp(
                os.stat(os.path.abspath(path)).st_mtime)
            assert (datetime.datetime.now() < last_update_time +
                    datetime.timedelta(minutes=CFG_WEBSEARCH_RSS_TTL))
            c_rss = filedesc.read()
            filedesc.close()
            req.write(c_rss)
            return
        except Exception, e:
            # do it live and cache

            previous_url = None
            if argd['jrec'] > 1:
                prev_jrec = argd['jrec'] - argd['rg']
                if prev_jrec < 1:
                    prev_jrec = 1
                previous_url = websearch_templates.build_rss_url(
                    argd, jrec=prev_jrec)

            #check if the user has rights to set a high wildcard limit
            #if not, reduce the limit set by user, with the default one
            if CFG_WEBSEARCH_WILDCARD_LIMIT > 0 and (
                    argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT
                    or argd['wl'] == 0):
                if acc_authorize_action(req, 'runbibedit')[0] != 0:
                    argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

            req.argd = argd
            recIDs = perform_request_search(req,
                                            of="id",
                                            c=argd['c'],
                                            cc=argd['cc'],
                                            p=argd['p'],
                                            f=argd['f'],
                                            p1=argd['p1'],
                                            f1=argd['f1'],
                                            m1=argd['m1'],
                                            op1=argd['op1'],
                                            p2=argd['p2'],
                                            f2=argd['f2'],
                                            m2=argd['m2'],
                                            op2=argd['op2'],
                                            p3=argd['p3'],
                                            f3=argd['f3'],
                                            m3=argd['m3'],
                                            wl=argd['wl'])
            nb_found = len(recIDs)
            next_url = None
            if len(recIDs) >= argd['jrec'] + argd['rg']:
                next_url = websearch_templates.build_rss_url(
                    argd, jrec=(argd['jrec'] + argd['rg']))

            first_url = websearch_templates.build_rss_url(argd, jrec=1)
            last_url = websearch_templates.build_rss_url(argd,
                                                         jrec=nb_found -
                                                         argd['rg'] + 1)

            recIDs = recIDs[-argd['jrec']:(-argd['rg'] - argd['jrec']):-1]

            rss_prologue = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
            websearch_templates.tmpl_xml_rss_prologue(current_url=current_url,
                                                      previous_url=previous_url,
                                                      next_url=next_url,
                                                      first_url=first_url, last_url=last_url,
                                                      nb_found=nb_found,
                                                      jrec=argd['jrec'], rg=argd['rg'],
                                                      cc=argd['cc']) + '\n'
            req.write(rss_prologue)
            rss_body = format_records(recIDs,
                                      of='xr',
                                      ln=argd['ln'],
                                      user_info=user_info,
                                      record_separator="\n",
                                      req=req,
                                      epilogue="\n")
            rss_epilogue = websearch_templates.tmpl_xml_rss_epilogue() + '\n'
            req.write(rss_epilogue)

            # update cache
            dirname = "%s/rss" % (CFG_CACHEDIR)
            mymkdir(dirname)
            fullfilename = "%s/rss/%s.xml" % (CFG_CACHEDIR, cache_filename)
            try:
                # Remove the file just in case it already existed
                # so that a bit of space is created
                os.remove(fullfilename)
            except OSError:
                pass

            # Check if there's enough space to cache the request.
            if len(os.listdir(
                    dirname)) < CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS:
                try:
                    os.umask(022)
                    f = open(fullfilename, "w")
                    f.write(rss_prologue + rss_body + rss_epilogue)
                    f.close()
                except IOError, v:
                    if v[0] == 36:
                        # URL was too long. Never mind, don't cache
                        pass
                    else:
                        raise
def query_get_comments(uid,
                       cmtID,
                       recID,
                       reviews,
                       ln,
                       abuse=False,
                       user_collections='',
                       collection=''):
    """
    private function
    @param user_collections: allowed collections for the user
    @param collection: collection to display
    @return tuple of comment where comment is
    tuple (nickname, uid, date_creation, body, id, status) if ranking disabled or
    tuple (nickname, uid, date_creation, body, nb_votes_yes, nb_votes_total, star_score, title, id, status)
    """
    qdict = {
        'id': 0,
        'id_bibrec': 1,
        'uid': 2,
        'date_creation': 3,
        'body': 4,
        'status': 5,
        'nb_abuse_reports': 6,
        'nb_votes_yes': 7,
        'nb_votes_total': 8,
        'star_score': 9,
        'title': 10,
        'email': -2,
        'nickname': -1
    }
    query = """SELECT c.id, c.id_bibrec, c.id_user,
                      DATE_FORMAT(c.date_creation, '%%Y-%%m-%%d %%H:%%i:%%S'), c.body,
                      c.status, c.nb_abuse_reports,
                      %s
                      u.email, u.nickname
               FROM cmtRECORDCOMMENT c LEFT JOIN user u
                                       ON c.id_user = u.id
               %s
               ORDER BY c.nb_abuse_reports DESC, c.nb_votes_yes DESC, c.date_creation
    """
    select_fields = reviews and 'c.nb_votes_yes, c.nb_votes_total, c.star_score, c.title,' or ''
    where_clause = "WHERE " + (reviews and 'c.star_score>0'
                               or 'c.star_score=0')
    if uid:
        where_clause += ' AND c.id_user=%i' % uid
    if recID:
        where_clause += ' AND c.id_bibrec=%i' % recID
    if cmtID:
        where_clause += ' AND c.id=%i' % cmtID
    if abuse:
        where_clause += ' AND c.nb_abuse_reports>0'

    res = run_sql(query % (select_fields, where_clause))
    collection_records = []
    if collection == 'Show all':
        for collection_name in user_collections:
            collection_records.extend(
                perform_request_search(cc=collection_name))
    else:
        collection_records.extend(perform_request_search(cc=collection))
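    # note: collection_records stays a plain list, so the membership test in
    # the loop below is O(n) per comment; collection_records = set(...) would
    # keep the same behaviour with O(1) lookups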
    output = []
    for qtuple in res:
        if qtuple[qdict['id_bibrec']] in collection_records:
            nickname = qtuple[qdict['nickname']] or get_user_info(
                qtuple[qdict['uid']], ln)[2]
            if reviews:
                comment_tuple = (nickname, qtuple[qdict['uid']],
                                 qtuple[qdict['date_creation']],
                                 qtuple[qdict['body']],
                                 qtuple[qdict['nb_votes_yes']],
                                 qtuple[qdict['nb_votes_total']],
                                 qtuple[qdict['star_score']],
                                 qtuple[qdict['title']], qtuple[qdict['id']],
                                 qtuple[qdict['status']])
            else:
                comment_tuple = (nickname, qtuple[qdict['uid']],
                                 qtuple[qdict['date_creation']],
                                 qtuple[qdict['body']], qtuple[qdict['id']],
                                 qtuple[qdict['status']])
            general_infos_tuple = (nickname, qtuple[qdict['uid']],
                                   qtuple[qdict['email']], qtuple[qdict['id']],
                                   qtuple[qdict['id_bibrec']],
                                   qtuple[qdict['nb_abuse_reports']])
            out_tuple = (comment_tuple, general_infos_tuple)
            output.append(out_tuple)
    return tuple(output)
def query_get_latest(comments, ln, top, user_collections, collection):
    """
    private function
    @param comments:  boolean indicating if we want to retrieve comments or reviews
    @param ln: language
    @param top: number of results to display
    @param user_collections: allowed collections for the user
    @param collection: collection to display
    @return tuple of comment where comment is
    tuple (nickname, uid, date_creation, body, id) if latest comments or
    tuple (nickname, uid, date_creation, body, star_score, id) if latest reviews
    """
    qdict = {
        'id': 0,
        'id_bibrec': 1,
        'uid': 2,
        'date_creation': 3,
        'body': 4,
        'nb_abuse_reports': 5,
        'star_score': 6,
        'nickname': -1
    }
    query = """SELECT c.id, c.id_bibrec, c.id_user,
                      DATE_FORMAT(c.date_creation, '%%Y-%%m-%%d %%H:%%i:%%S'), c.body,
                      c.nb_abuse_reports,
                      %s
                      u.nickname
                      FROM cmtRECORDCOMMENT c LEFT JOIN user u
                      ON c.id_user = u.id
               %s
               ORDER BY c.date_creation DESC
               LIMIT %s
    """
    select_fields = '' if comments else 'c.star_score, '
    where_clause = "WHERE " + (
        'c.star_score=0' if comments else 'c.star_score>0'
    ) + ' AND c.status="ok" AND c.nb_abuse_reports < %s' % CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN

    res = run_sql(query % (select_fields, where_clause, top))

    collection_records = []
    if collection == 'Show all':
        for collection_name in user_collections:
            collection_records.extend(
                perform_request_search(cc=collection_name))
    else:
        collection_records.extend(perform_request_search(cc=collection))
    output = []
    for qtuple in res:
        if qtuple[qdict['id_bibrec']] in collection_records:
            nickname = qtuple[qdict['nickname']] or get_user_info(
                qtuple[qdict['uid']], ln)[2]
            if not comments:
                comment_tuple = (nickname, qtuple[qdict['uid']],
                                 qtuple[qdict['date_creation']],
                                 qtuple[qdict['body']],
                                 qtuple[qdict['star_score']],
                                 qtuple[qdict['id']])
            else:
                comment_tuple = (nickname, qtuple[qdict['uid']],
                                 qtuple[qdict['date_creation']],
                                 qtuple[qdict['body']], qtuple[qdict['id']])
            general_infos_tuple = (nickname, qtuple[qdict['uid']],
                                   qtuple[qdict['id']],
                                   qtuple[qdict['id_bibrec']],
                                   qtuple[qdict['nb_abuse_reports']])

            out_tuple = (comment_tuple, general_infos_tuple)
            output.append(out_tuple)
    return tuple(output)
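# A minimal usage sketch for query_get_latest, assuming a live Invenio
# instance; the collection names and top value below are made up.
def _latest_comments_demo():
    # Five newest approved comments across the user's allowed collections.
    return query_get_latest(comments=True, ln='en', top=5,
                            user_collections=['Articles', 'Preprints'],
                            collection='Show all')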
Example #38
def bst_fermilab():
    write_message('cd /afs/fnal.gov/files/expwww/bss/html/techpubs')

    for series in SERIES1:
        reports = []
        authorId = False
        search = "find r fermilab-" + series + "-*"
        #search = "find recid 1261432"
        #print search
        result = perform_request_search(p=search, cc='HEP')
        for recid in result:
            #print recid
            reportValues = get_fieldvalues(recid, '037__a')
            author = get_fieldvalues(recid, '100__a')
            authorId = get_fieldvalues(recid, '100__i')
            authorAff = get_fieldvalues(recid, '100__u')
            title = get_fieldvalues(recid, '245__a')
            experiment = get_fieldvalues(recid, '693__e')

            if author:
                author = author[0]
            else:
                author = ''
            if title:
                title = '<i>' + title[0][:100] + '</i>'
            else:
                title = ''
            if experiment:
                experiment = experiment[0]
            else:
                experiment = ''
            if authorAff:
                authorAff = authorAff[0]
            else:
                authorAff = ''
            #print "author = ", author
            #print "title = ", title
            #print "authorId = ", authorId
            #print "experiment = ", experiment
            if authorId:
                authorId = authorId[0]
            for report in reportValues:
                if re.match('FERMILAB-' + series, report, re.IGNORECASE):
                    y = [
                        report,
                        str(recid), author, title, authorId, experiment,
                        authorAff
                    ]
                    #print "y = ", y
                    reports.append(y)
        reports.sort(reverse=True)

        filename = os.path.join(CFG_FERMILAB_PATH,
                                'fermilab-reports-' + series + '.html')
        output = open(filename, 'w')
        output.write(
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n')
        output.write(
            '          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n')
        output.write('<html xmlns="http://www.w3.org/1999/xhtml">\n')
        output.write('<head>\n')
        output.write('<title>Fermilab Technical Publications: ')
        output.write(escape(series))
        output.write('</title>\n')
        output.write(
            '<meta http-equiv="content-type" content="text/html;charset=utf-8" />\n'
        )
        output.write('</head>\n')
        output.write('<body>\n')
        output.write(
            '<a href="http://bss.fnal.gov/techpubs/fermilab_spires.html">Fermilab Technical Publications</a>\n'
        )
        output.write('<br /><br />')
        dateTimeStamp = '<i>Updated ' + chicago_timezone.fromutc(
            datetime.datetime.utcnow()).strftime(
                '%Y-%m-%d %H:%M:%S') + '</i>\n'
        output.write(dateTimeStamp)
        output.write('<br />\n<table>\n')
        for report in reports:
            #print "report =", report
            if report[4]:
                search2 = '035__a:' + report[4]
                result = perform_request_search(p=search2, cc='HepNames')
                # Guard against an empty HepNames result before building
                # the author link.
                if result:
                    report[2] = ('<a href="http://inspirehep.net/record/' +
                                 str(result[0]) + '">' + report[2] + '</a>')
            line = ('<tr><td><a href="http://inspirehep.net/record/' +
                    report[1] + '">' + report[0] + '</a></td><td>' +
                    report[2] + '</td><td>' + report[3] + '</td></tr>\n')
            if re.search(r'THESIS', report[0]):
                if report[5]:
                    search2 = '119__a:' + report[5]
                    result = perform_request_search(p=search2,
                                                    cc='Experiments')
                    # Only dereference the search result if it is non-empty.
                    if result:
                        result = result[0]
                        collaboration = get_fieldvalues(result, '710__g')
                        if collaboration:
                            collaboration = collaboration[0].replace(
                                ' Collaboration', '')
                            report[5] = report[5] + ' (' + collaboration + ')'
                        report[5] = ('<a href="http://inspirehep.net/record/' +
                                     str(result) + '">' + report[5] + '</a>')
                line = ('<tr><td><a href="http://inspirehep.net/record/' +
                        report[1] + '">' + report[0] + '</a></td><td>' +
                        report[2] + '</td><td>' + report[5] + '</td><td>' +
                        report[6] + '</td><td>' + report[3] + '</td></tr>\n')
            output.write(line)
        output.write('</table>\n')
        output.write('</body>\n')
        output.write('</html>\n')
        output.close()
        write_message('\\rm fermilab-reports-' + series + '.html')
        write_message('cp %s .' % filename)

    reports = []
    currentyear = time.strftime('%Y')
    for series in SERIES2:
        #print series
        for year in range(1970, time.localtime()[0] + 1):
            #print year
            # Two-digit year; the previous re.sub('19')/re.sub('20') pair
            # mangled years such as 2019 and 2020.
            dd = '%02d' % (year % 100)
            search = "find r fermilab-" + series + "-" + dd + "*"
            #print search
            result = perform_request_search(p=search, cc='HEP')
            for recid in result:
                reportValues = get_fieldvalues(recid, '037__a')
                author = get_fieldvalues(recid, '100__a')
                title = get_fieldvalues(recid, '245__a')
                if author:
                    author = author[0]
                else:
                    author = ''
                if title:
                    title = title[0][:100]
                else:
                    title = ''
                for report in reportValues:
                    #print 'report = ' + report
                    #print 'FERMILAB-' + series
                    if re.match('FERMILAB-' + series, report, re.IGNORECASE):
                        number = re.sub("FERMILAB-" + series + "-", "", report)
                        y = [year, number, report, str(recid), author, title]
                        #print 'y = ' , y
                        reports.append(y)
    reports.sort(reverse=True)
    #print reports

    filename = os.path.join(CFG_FERMILAB_PATH,
                            'fermilab-reports-preprints.html')
    output = open(filename, 'w')
    output.write('<html>\n')
    output.write('<head>\n')
    output.write('<title>Fermilab Technical Publications: ')
    output.write('preprints')
    output.write('</title>\n')
    output.write('</head>\n')
    output.write('<body>\n')
    output.write(
        '<a href="http://bss.fnal.gov/techpubs/fermilab_spires.html">Fermilab Technical Publications</a>\n'
    )
    output.write('<br /><br />')
    dateTimeStamp = '<i>Updated ' + chicago_timezone.fromutc(
        datetime.datetime.utcnow()).strftime('%Y-%m-%d %H:%M:%S') + '</i>\n'
    output.write(dateTimeStamp)
    output.write('<br />\n<table>\n')
    for report in reports:
        line = ('<tr><td><a href="http://inspirehep.net/record/' +
                report[3] + '">' + report[2] + '</a></td><td>' +
                report[4] + '</td><td>' + report[5] + '</td></tr>\n')
        output.write(line)
    output.write('</table>\n')
    output.write('</body>\n')
    output.write('</html>\n')
    output.close()
    write_message('cd /afs/fnal.gov/files/expwww/bss/html/techpubs')
    write_message('\\rm fermilab-reports-preprints.html')
    write_message('cp %s .' % filename)
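# A quick illustration (made-up report numbers) of a caveat in the reverse
# sorts above: the number field is a string, so the ordering is
# lexicographic rather than numeric.
reports = [[2005, '9', 'FERMILAB-PUB-05-9'],
           [2005, '10', 'FERMILAB-PUB-05-10']]
reports.sort(reverse=True)
assert reports[0][1] == '9'  # '9' sorts above '10' as a string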
Example #39
def arxiv_login(req, picked_profile=None):
    '''
    Log in through arXiv. If the user is already associated with a personid,
    return that personid. If the user has no pid, try to guess which personid
    to associate based on the surname and the papers from arXiv. If no
    compatible person is found, create a new one. At the end of the process,
    open a ticket for the user claiming the papers from arXiv.
    NOTE: the user will find the open ticket, which requires them to go
    through a final review before it is committed.

    @param req: Apache request object
    @type req: Apache request object

    @return: Returns the pid resulting in the process
    @rtype: int
    '''
    def session_bareinit(req):
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.dirty = True

    session_bareinit(req)

    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = False

    try:
        name = uinfo['external_firstname']
    except KeyError:
        name = ''
    try:
        surname = uinfo['external_familyname']
    except KeyError:
        surname = ''

    if surname:
        session['personinfo']['arxiv_name'] = nameapi.create_normalized_name(
            nameapi.split_name_parts(surname + ', ' + name))
    else:
        session['personinfo']['arxiv_name'] = ''

    session.dirty = True

    try:
        arxiv_p_ids = uinfo['external_arxivids'].split(';')
    except KeyError:
        arxiv_p_ids = []

    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',

    try:
        found_bibrecs = set(
            reduce(add, [
                perform_request_search(p='037:' + str(arx), of='id', rg=0)
                for arx in arxiv_p_ids
            ]))
    except (IndexError, TypeError):
        found_bibrecs = set()

    #found_bibrecs = [567700, 567744]

    uid = getUid(req)
    pid, pid_found = dbapi.get_personid_from_uid([[uid]])

    if pid_found:
        pid = pid[0]
    else:
        if picked_profile is None:
            top5_list = dbapi.find_top5_personid_for_new_arXiv_user(
                found_bibrecs,
                nameapi.create_normalized_name(
                    nameapi.split_name_parts(surname + ', ' + name)))
            return ("top5_list", top5_list)
        else:
            pid = dbapi.check_personids_availability(picked_profile, uid)

    pid_bibrecs = set(
        [i[0] for i in dbapi.get_all_personids_recs(pid, claimed_only=True)])
    missing_bibrecs = found_bibrecs - pid_bibrecs
    #present_bibrecs = found_bibrecs.intersection(pid_bibrecs)

    #assert len(found_bibrecs) == len(missing_bibrecs) + len(present_bibrecs)

    tempticket = []
    #now we have to open the tickets...
    #person_papers contains the papers which are already assigned to the
    #person and came from arXiv; they can be claimed regardless

    for bibrec in missing_bibrecs:
        tempticket.append({
            'pid': pid,
            'bibref': str(bibrec),
            'action': 'confirm'
        })

    #check if ticket targets (bibref for pid) are already in ticket
    for t in list(tempticket):
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)

    session.dirty = True

    if picked_profile is not None and picked_profile != pid and picked_profile != -1:
        return ("chosen pid not available", pid)
    elif picked_profile is not None and picked_profile == pid and picked_profile != -1:
        return ("pid assigned by user", pid)
    else:
        return ("pid", pid)
    def tmpl_papers_with_self_papers_box(self, pubs, self_pubs, bibauthorid_data,
                                         num_downloads,
                                         ln, add_box=True, loading=False):
        _ = gettext_set_language(ln)
        if not loading:
            ib_pubs = intbitset(pubs)
            ib_self_pubs = intbitset(self_pubs)

            if bibauthorid_data["cid"]:
                baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["cid"])
            else:
                baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["pid"])
            baid_query = baid_query + " "

            rec_query = baid_query
            self_rec_query = baid_query + " authorcount:1 "
            descstr = ['', "<strong>All papers</strong>"]
            searchstr = [" All papers "]
            self_searchstr = [" Single authored "]
            if pubs:
                searchstr.append(
                    create_html_link(websearch_templates.build_search_url(p=rec_query),
                                     {}, str(len(pubs))))
            else:
                searchstr.append("0")
            if self_pubs:
                self_searchstr.append(
                    create_html_link(websearch_templates.build_search_url(p=self_rec_query),
                                     {}, str(len(self_pubs))))
            else:
                self_searchstr.append("0")
            psummary = searchstr
            self_psummary = self_searchstr

            if CFG_BIBRANK_SHOW_DOWNLOAD_STATS and num_downloads:
                psummary[0] += " <br> (" + _("downloaded") + " "
                psummary[0] += str(num_downloads) + " " + _("times") + ")"

            if CFG_INSPIRE_SITE:
                CFG_COLLS = ['Book',
                             'ConferencePaper',
                             'Introductory',
                             'Lectures',
                             'Published',
                             'Review',
                             'Thesis',
                             'Proceedings']
            else:
                CFG_COLLS = ['Article',
                             'Book',
                             'Preprint', ]

            collsd = {}
            self_collsd = {}

            for coll in CFG_COLLS:
                search_result = intbitset(perform_request_search(rg=0, f="collection", p=coll))
                collsd[coll] = list(ib_pubs & search_result)
                self_collsd[coll] = list(ib_self_pubs & search_result)

            for coll in CFG_COLLS:
                rec_query = baid_query + 'collection:' + wrap_author_name_in_quotes_if_needed(coll)
                self_rec_query = baid_query + 'collection:' + wrap_author_name_in_quotes_if_needed(coll) + ' authorcount:1 '
                descstr.append("%s" % coll)
                if collsd[coll]:
                    psummary.append(
                        create_html_link(websearch_templates.build_search_url(p=rec_query),
                                         {}, str(len(collsd[coll]))))
                else:
                    psummary.append("0")
                if self_collsd[coll]:
                    self_psummary.append(
                        create_html_link(websearch_templates.build_search_url(p=self_rec_query),
                                         {}, str(len(self_collsd[coll]))))
                else:
                    self_psummary.append("0")
            tp = "<tr><td> %s </td> <td align='right'> %s </td> <td align='right'> %s </td></tr>"
            line2 = "<table > %s </table>"
            line2 = line2 % ''.join(tp % (x, y, z) for x, y, z in zip(*(descstr, psummary, self_psummary)))
        else:
            line2 = self.loading_html()


        if not add_box:
            return line2
        line1 = "<strong>" + _("Papers") + "</strong>"
        papers_box = self.tmpl_print_searchresultbox("combined_papers", line1, line2)
        return papers_box
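# A minimal sketch of the per-collection split in the method above, using
# plain sets in place of intbitset and made-up record ids: each bucket is
# the intersection of the author's papers with that collection's records.
pubs = set([1, 2, 3, 4])
self_pubs = set([2, 4])
collection_records = {'Book': set([1, 2]), 'Thesis': set([3, 9])}
collsd = dict((coll, sorted(pubs & recs))
              for coll, recs in collection_records.items())
self_collsd = dict((coll, sorted(self_pubs & recs))
                   for coll, recs in collection_records.items())
assert collsd['Book'] == [1, 2] and self_collsd['Book'] == [2]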
import re
import sys
import datetime
import subprocess
from invenio.search_engine import perform_request_search

VERBOSE = False

search = raw_input("Run bibrank on this search: ")
x = perform_request_search(p=search, cc="HEP")
if len(x) > 0:
    mylist = [str(r) for r in x]
else:
    print "No records found."
    sys.exit()

today = str(datetime.date.today())
newfile = 'tmp_loss_from_search__' + today + '.txt'
output = open(newfile, 'w')
amount = str(len(mylist))
mystring = ','.join(mylist)
mystring2 = 'sudo -u apache /opt/cds-invenio/bin/bibrank -u cleggm1 \
--disable-citation-losses-check -i ' + mystring
if len(mylist) > 1000:
    print "There are %s records that will be touched" % (amount)
    chunks = [mylist[x:x + 500] for x in xrange(0, len(mylist), 500)]
    time_inter = 0
    for x in chunks:
        mystring = ','.join(x)
        time_inter += 1