コード例 #1
0
ファイル: scopusapi.py プロジェクト: 3660628/citacie
 def append_identifier(d, key, obj, type):
     """Append the identifier(s) found under ``key`` in ``d`` to ``obj``.

     The value is fetched with ``exists_to_none(d, key)``.  A falsy result
     adds nothing, a list adds one ``Identifier`` per element, and any
     other value adds a single ``Identifier``.  Every identifier is
     created with the given ``type`` and appended to ``obj.identifiers``.
     """
     found = exists_to_none(d, key)
     if not found:
         return
     # Treat a lone value as a one-element list so both shapes share a loop.
     values = found if isinstance(found, list) else [found]
     for value in values:
         obj.identifiers.append(Identifier(value, type=type))
コード例 #2
0
 def do_run(self, _db):
     """Add an ``Identifier`` to ``_db`` and then raise ``RuntimeError``."""
     _db.add(Identifier(type='You Can', identifier='Keep It'))
     # The failure is unconditional and happens after the add.
     raise RuntimeError
コード例 #3
0
 def do_run(self, _db):
     """Add a single ``Identifier`` to ``_db``."""
     _db.add(Identifier(type='Keep It', identifier='100'))
コード例 #4
0
ファイル: scopus.py プロジェクト: 3660628/citacie
    def _parse_csv(self, content, encoding='UTF-8'):
        """Parse a SCOPUS CSV export and yield one ``Publication`` per row.

        ``content`` is the raw CSV text; a leading BOM is removed with
        ``strip_bom`` before the rows are read by ``unicodecsv.DictReader``
        using ``encoding``.  This is a generator, so nothing is parsed
        until it is iterated.

        Raises ``KeyError`` when an expected column is absent from a row
        and ``ValueError`` when a non-empty 'Year' is not an integer.
        """
        reader = unicodecsv.DictReader(strip_bom(content).splitlines(),
                                       encoding=encoding)

        def empty_to_none(s):
            # Normalise missing, empty and whitespace-only fields to None.
            if s is None:
                return None
            s = s.strip()
            return s if s else None

        def list_remove_empty(l):
            # Strip every item and drop the ones that end up empty.
            stripped = (empty_to_none(x) for x in l)
            return [v for v in stripped if v]

        def to_num(x):
            # An empty field is reported as 0 instead of raising.
            x = x.strip()
            return int(x) if x else 0

        for line in reader:
            if line['Authors'] == '[No author name available]':
                authors = []
            else:
                # (mrshu): SCOPUS separates both the surname from the
                # initials and the individual authors with commas, e.g.
                #
                # Brejová, B., Brown, D.G., Li, M., Vinař, T.
                #
                # which is ambiguous.  The end of a full name is always
                # '.,', so that comma is replaced by a semicolon and the
                # author parser is told to split on semicolons instead.
                author_list = re.sub(r'\.,', ';', line['Authors'])
                authors = Author.parse_sn_first_list(author_list,
                                                     separator=u';')
            pub = Publication(line['Title'], authors, to_num(line['Year']))

            source_title = empty_to_none(line['Source title'])
            if source_title:
                source_title, replacements = re.subn(
                    r' \(including subseries [^)]+\)', '', source_title)
                source_title = source_title.strip()
                # A removed "(including subseries ...)" suffix marks the
                # title as a series rather than a plain source.
                if replacements:
                    pub.series = source_title
                else:
                    pub.published_in = source_title
            pub.volume = empty_to_none(line['Volume'])
            pub.issue = empty_to_none(line['Issue'])
            pub.pages = make_page_range(empty_to_none(line['Page start']),
                                        empty_to_none(line['Page end']))

            # (mrshu): for reasons unknown SCOPUS now returns a combined
            # 'DOILink' column holding both the DOI and the link.  No
            # cleaner way was found; this hack at least splits it on the
            # double quote between the two values.
            splits = line['DOILink'].split('"')
            if len(splits) > 1:
                link, doi = splits[1], splits[0]
            else:
                link, doi = splits[0], None

            pub.times_cited = empty_to_none(line['Cited by'])
            pub.article_no = empty_to_none(line['Art. No.'])
            pub.publisher = empty_to_none(line['Publisher'])

            url = empty_to_none(link)
            if url:
                pub.source_urls.append(
                    URL(url, type='SCOPUS', description='SCOPUS'))
                # Guard on the value list itself so that an empty list
                # can never be indexed.
                eids = parse_qs(urlparse(url).query).get('eid')
                if eids:
                    pub.identifiers.append(
                        Identifier(eids[0], type='SCOPUS'))

            for issn in list_remove_empty(line['ISSN'].split(u';')):
                pub.identifiers.append(Identifier(issn, type='ISSN'))

            for isbn in list_remove_empty(line['ISBN'].split(u';')):
                pub.identifiers.append(Identifier(isbn, type='ISBN'))

            doi = empty_to_none(doi)
            if doi:
                pub.identifiers.append(Identifier(doi, type='DOI'))

            pub.indexes.append(Index('SCOPUS', type='SCOPUS'))

            yield pub
コード例 #5
0
ファイル: wok.py プロジェクト: 3660628/citacie
 def _parse_tab_delimited(self, text):
     """Yield a ``Publication`` for every data row of tab-delimited ``text``.

     The first line is a header of field tags and every following line is
     one record.  Authors are read from the 'AF' column, or from 'AU'
     when the header has no 'AF'.  Optional fields that are empty are not
     assigned on the publication.

     Raises ``ValueError`` when a needed tag is missing from the header
     or a 'PY' value is not an integer.
     """
     rows = text.splitlines()
     header = rows[0].split('\t')
     try:
         authors_at = header.index('AF')
     except ValueError:
         authors_at = header.index('AU')

     # Resolve every remaining tag up front, in a fixed order, so that a
     # missing column fails before any row is processed.
     tags = ('TI', 'SO', 'PY', 'SN', 'BN', 'DI', 'UT', 'BP', 'EP', 'AR',
             'BS', 'VL', 'IS', 'SI', 'SU', 'PU', 'PI', 'SE')
     at = {}
     for tag in tags:
         at[tag] = header.index(tag)

     # (attribute, tag) pairs copied onto the publication when non-empty;
     # split in two because the page range is assigned in between.
     before_pages = (('published_in', 'SO'), ('series', 'BS'),
                     ('volume', 'VL'))
     after_pages = (('issue', 'IS'), ('special_issue', 'SI'),
                    ('supplement', 'SU'), ('article_no', 'AR'),
                    ('publisher', 'PU'), ('publisher_city', 'PI'),
                    ('edition', 'SE'))
     # Identifier columns that may hold several ';'-separated values.
     multi_valued = (('ISSN', 'SN'), ('ISBN', 'BN'))

     for row in rows[1:]:
         data = row.split('\t')
         pub = Publication(data[at['TI']],
                           Author.parse_sn_first_list(data[authors_at]),
                           int(data[at['PY']]))

         for attr, tag in before_pages:
             value = data[at[tag]]
             if value:
                 setattr(pub, attr, value)

         pub.pages = make_page_range(data[at['BP']], data[at['EP']])

         for attr, tag in after_pages:
             value = data[at[tag]]
             if value:
                 setattr(pub, attr, value)

         wok_id = data[at['UT']]
         if wok_id:
             pub.identifiers.append(Identifier(wok_id, type='WOK'))

         for kind, tag in multi_valued:
             raw = data[at[tag]]
             if raw:
                 for part in [piece.strip() for piece in raw.split(u';')]:
                     pub.identifiers.append(Identifier(part, type=kind))

         doi = data[at['DI']]
         if doi:
             pub.identifiers.append(Identifier(doi, type='DOI'))

         yield pub
コード例 #6
0
ファイル: wok.py プロジェクト: 3660628/citacie
    def _convert_to_publication(self, record):
        """Build a ``Publication`` from a Web Of Knowledge ``record``.

        Reads ``record.uid``, ``record.title``, ``record.authors``,
        ``record.source`` and ``record.other``.  Each of the latter four
        is iterated as a sequence of pairs; ``label`` and ``value`` are
        read from the pairs of ``title``, ``source`` and ``other``, and
        ``value`` from those of ``authors``.

        Returns the populated ``Publication``.  Its identifiers start
        with the WOK id, followed by any ISBN/ISSN/DOI values found in
        ``record.other``.

        Raises ``TypeError`` when the record has no 'Title' label or no
        usable 'Published.BiblioYear' value, because the missing value
        (``None``) is passed straight to ``join`` / ``int``.
        """
        wokid = Identifier(unicode(record.uid),
                           type='WOK',
                           description='Web Of Knowledge')

        def extract_label(group, label):
            # Value of the first pair in ``group`` carrying ``label``.
            for pair in group:
                if pair.label == label:
                    return pair.value
            return None

        def extract_values(group, label):
            # Like extract_label, but an empty value counts as missing.
            values = extract_label(group, label)
            if values is None or len(values) == 0:
                return None
            return values

        def extract_single(group, label):
            values = extract_values(group, label)
            if values is None:
                return None
            # (mrshu): odd as it may seem, WoS has more than one article
            # number for some articles, so extra values are ignored here
            # rather than treated as an error.
            return unicode(values[0])

        def extract_concat(group, label, delim=u'|'):
            values = extract_values(group, label)
            if values is None:
                return None
            return delim.join(unicode(x) for x in values)

        title = u''.join(extract_label(record.title, 'Title'))
        authors = [a for p in record.authors for a in p.value]
        parsed_authors = [Author.parse_sn_first(unicode(x)) for x in authors]
        year = int(extract_single(record.source, 'Published.BiblioYear'))
        p = Publication(title, parsed_authors, year)

        p.published_in = extract_single(record.source, 'SourceTitle')
        p.pages = extract_single(record.source, 'Pages')
        p.volume = extract_single(record.source, 'Volume')
        p.series = extract_concat(record.source, 'BookSeriesTitle')
        p.issue = extract_single(record.source, 'Issue')
        p.special_issue = extract_single(record.source, 'SpecialIssue')
        p.supplement = extract_single(record.source, 'Supplement')
        p.article_no = extract_single(record.other, 'Identifier.article_no')

        p.identifiers.append(wokid)

        # Labels in record.other that map onto identifier types.
        idtypes = {
            'Identifier.Isbn': 'ISBN',
            'Identifier.Issn': 'ISSN',
            'Identifier.Doi': 'DOI',
        }

        for pair in record.other:
            if pair.label not in idtypes:
                continue
            for value in pair.value:
                p.identifiers.append(
                    Identifier(unicode(value), type=idtypes[pair.label]))

        return p
コード例 #7
0
from model import Identifier

if __name__ == '__main__':
    # Script entry point: create an Identifier model, train it, save it.
    identifier_model = Identifier()
    # Training length is fixed at 500 epochs.
    identifier_model.train(epochs=500)
    # 'identifier_model' is handed to save(); presumably the output name
    # or path -- confirm against model.Identifier.save.
    identifier_model.save('identifier_model')