def append_identifier(d, key, obj, type):
    """Append Identifier objects of the given type to ``obj.identifiers``.

    The raw value is looked up with ``exists_to_none(d, key)``. A falsy
    result adds nothing. A list adds one Identifier per element; any other
    truthy value adds a single Identifier wrapping that value.
    """
    found = exists_to_none(d, key)
    if not found:
        return

    # Treat a lone value as a one-element list so both cases share a loop.
    values = found if isinstance(found, list) else [found]
    for value in values:
        obj.identifiers.append(Identifier(value, type=type))
def do_run(self, _db):
    """Add an Identifier to ``_db`` and then raise RuntimeError.

    NOTE(review): presumably exercises how the caller handles a failure
    after something was added to the session -- confirm against the caller.
    """
    _db.add(Identifier(identifier='Keep It', type='You Can'))
    raise RuntimeError()
def do_run(self, _db):
    """Add a single Identifier (type 'Keep It', identifier '100') to ``_db``."""
    _db.add(Identifier(identifier='100', type='Keep It'))
def _parse_csv(self, content, encoding='UTF-8'):
    """Parse a SCOPUS CSV export and yield one Publication per row.

    ``content`` is passed through ``strip_bom`` and split into lines before
    being handed to ``unicodecsv.DictReader`` with the given ``encoding``.

    Each yielded Publication gets its title, authors and year from the
    constructor, and then (where the row has a non-empty value) series or
    published_in, volume, issue, pages, times_cited, article_no, publisher,
    a SCOPUS source URL, and SCOPUS / ISSN / ISBN / DOI identifiers. Every
    publication is tagged with the 'SCOPUS' index.

    Raises KeyError if an expected column is missing from the header.
    """
    reader = unicodecsv.DictReader(strip_bom(content).splitlines(),
                                   encoding=encoding)

    def empty_to_none(s):
        """Strip ``s``; return None for None or whitespace-only input."""
        if s is None:
            return None
        s = s.strip()
        return s or None

    def list_remove_empty(items):
        """Strip every item and drop those that end up empty."""
        cleaned = (empty_to_none(item) for item in items)
        return [item for item in cleaned if item]

    def to_num(x):
        """Convert a numeric string to int; an empty string becomes 0."""
        x = x.strip()
        return int(x) if x else 0

    for line in reader:
        raw_authors = line['Authors']
        if raw_authors == '[No author name available]':
            authors = []
        else:
            # (mrshu): SCOPUS separates both the surname from the initials
            # and the individual authors with commas, e.g.
            #
            #   Brejová, B., Brown, D.G., Li, M., Vinař, T.
            #
            # That is ambiguous, so we find the end of each full name
            # (a period followed by a comma) and replace it with a
            # semicolon, then tell the author parser that the separator
            # is a semicolon.
            raw_authors = re.sub(r'\.,', ';', raw_authors)
            authors = Author.parse_sn_first_list(raw_authors, separator=u';')

        pub = Publication(line['Title'], authors, to_num(line['Year']))

        source_title = empty_to_none(line['Source title'])
        if source_title:
            # A "(including subseries ...)" suffix marks the title as a
            # series name; otherwise it is the venue itself.
            source_title, replacements = re.subn(
                r' \(including subseries [^)]+\)', '', source_title)
            source_title = source_title.strip()
            if replacements:
                pub.series = source_title
            else:
                pub.published_in = source_title

        pub.volume = empty_to_none(line['Volume'])
        pub.issue = empty_to_none(line['Issue'])
        pub.pages = make_page_range(empty_to_none(line['Page start']),
                                    empty_to_none(line['Page end']))

        # (mrshu): for reasons I cannot understand, SCOPUS now returns a
        # combined 'DOILink' field holding both the DOI and the link.
        # I did not find a more sensible approach; this hack at least
        # splits the two apart: DOI before the first double quote, link
        # after it. Without a quote the whole field is taken as the link.
        splits = line['DOILink'].split('"')
        if len(splits) > 1:
            link, raw_doi = splits[1], splits[0]
        else:
            link, raw_doi = splits[0], None

        pub.times_cited = empty_to_none(line['Cited by'])
        pub.article_no = empty_to_none(line['Art. No.'])
        pub.publisher = empty_to_none(line['Publisher'])

        url = empty_to_none(link)
        if url:
            pub.source_urls.append(
                URL(url, type='SCOPUS', description='SCOPUS'))
            # The SCOPUS record id travels in the link's 'eid' query
            # parameter. The original condition was `... and len`, which
            # tested the builtin function itself (always truthy); check
            # that the parameter actually carries a value instead.
            eids = parse_qs(urlparse(url).query).get('eid')
            if eids:
                pub.identifiers.append(Identifier(eids[0], type='SCOPUS'))

        for issn in list_remove_empty(line['ISSN'].split(u';')):
            pub.identifiers.append(Identifier(issn, type='ISSN'))

        for isbn in list_remove_empty(line['ISBN'].split(u';')):
            pub.identifiers.append(Identifier(isbn, type='ISBN'))

        doi = empty_to_none(raw_doi)
        if doi:
            pub.identifiers.append(Identifier(doi, type='DOI'))

        pub.indexes.append(Index('SCOPUS', type='SCOPUS'))

        yield pub
def _parse_tab_delimited(self, text):
    """Parse a tab-delimited export and yield one Publication per data row.

    The first line of ``text`` is the header; its two-letter column tags
    are used to locate each field. Authors are read from the 'AF' column,
    falling back to 'AU' when 'AF' is absent. The 'UT' column becomes an
    identifier of type 'WOK'; 'SN', 'BN' and 'DI' become ISSN, ISBN and
    DOI identifiers (ISSN/ISBN may hold several ';'-separated values).

    Empty input yields nothing, and blank rows (e.g. a trailing empty
    line) are skipped instead of failing on a column lookup.

    Raises ValueError if a required column tag is missing from the header
    or a row's year is not an integer.
    """
    lines = text.splitlines()
    if not lines:
        # No header row at all, hence no records.
        return

    columns = lines[0].split('\t')
    try:
        col_authors = columns.index('AF')
    except ValueError:
        col_authors = columns.index('AU')
    col_title = columns.index('TI')
    col_source = columns.index('SO')
    col_year = columns.index('PY')
    col_issn = columns.index('SN')
    col_isbn = columns.index('BN')
    col_doi = columns.index('DI')
    col_id = columns.index('UT')
    col_begin_page = columns.index('BP')
    col_end_page = columns.index('EP')
    col_article_no = columns.index('AR')
    col_book_series = columns.index('BS')
    col_volume = columns.index('VL')
    col_issue = columns.index('IS')
    col_special_issue = columns.index('SI')
    col_supplement = columns.index('SU')
    col_publisher = columns.index('PU')
    col_publisher_city = columns.index('PI')
    col_edition = columns.index('SE')

    def split_values(field):
        """Split a ';'-separated field, dropping empty fragments."""
        stripped = (part.strip() for part in field.split(u';'))
        return [part for part in stripped if part]

    for line in lines[1:]:
        if not line.strip():
            # A blank row carries no record; indexing into it would fail.
            continue

        data = line.split('\t')
        pub = Publication(data[col_title],
                          Author.parse_sn_first_list(data[col_authors]),
                          int(data[col_year]))
        if data[col_source]:
            pub.published_in = data[col_source]
        if data[col_book_series]:
            pub.series = data[col_book_series]
        if data[col_volume]:
            pub.volume = data[col_volume]
        pub.pages = make_page_range(data[col_begin_page],
                                    data[col_end_page])
        if data[col_issue]:
            pub.issue = data[col_issue]
        if data[col_special_issue]:
            pub.special_issue = data[col_special_issue]
        if data[col_supplement]:
            pub.supplement = data[col_supplement]
        if data[col_article_no]:
            pub.article_no = data[col_article_no]
        if data[col_publisher]:
            pub.publisher = data[col_publisher]
        if data[col_publisher_city]:
            pub.publisher_city = data[col_publisher_city]
        if data[col_edition]:
            pub.edition = data[col_edition]
        if data[col_id]:
            pub.identifiers.append(Identifier(data[col_id], type='WOK'))
        if data[col_issn]:
            for issn in split_values(data[col_issn]):
                pub.identifiers.append(Identifier(issn, type='ISSN'))
        if data[col_isbn]:
            for isbn in split_values(data[col_isbn]):
                pub.identifiers.append(Identifier(isbn, type='ISBN'))
        if data[col_doi]:
            pub.identifiers.append(Identifier(data[col_doi], type='DOI'))
        yield pub
def _convert_to_publication(self, record):
    """Build a Publication from a single Web Of Knowledge record.

    ``record.title``, ``record.source`` and ``record.other`` are iterated
    as groups of pairs exposing ``.label`` and ``.value``; ``record.authors``
    is iterated for its ``.value`` lists. The returned Publication carries
    a 'WOK' identifier built from ``record.uid`` plus any ISBN / ISSN / DOI
    identifiers found in ``record.other``.
    """
    wok_identifier = Identifier(unicode(record.uid), type='WOK',
                                description='Web Of Knowledge')

    def find_values(pairs, label):
        """Return the value of the first pair labelled ``label``, else None."""
        return next((pair.value for pair in pairs if pair.label == label),
                    None)

    def first_value(pairs, label):
        """Return the first value under ``label`` as unicode, or None."""
        values = find_values(pairs, label)
        if values is None or len(values) == 0:
            return None
        # (mrshu): strange as it may seem, WoS lists more than one article
        # number for some articles, so additional values are tolerated
        # here (only the first one is used) rather than rejected.
        return unicode(values[0])

    def joined_values(pairs, label, delim=u'|'):
        """Return all values under ``label`` joined by ``delim``, or None."""
        values = find_values(pairs, label)
        if values is None or len(values) == 0:
            return None
        return delim.join(unicode(value) for value in values)

    title = u''.join(find_values(record.title, 'Title'))

    raw_authors = []
    for author_group in record.authors:
        raw_authors.extend(author_group.value)
    parsed_authors = [Author.parse_sn_first(unicode(name))
                      for name in raw_authors]

    year = int(first_value(record.source, 'Published.BiblioYear'))

    publication = Publication(title, parsed_authors, year)

    for attribute, label in (('published_in', 'SourceTitle'),
                             ('pages', 'Pages'),
                             ('volume', 'Volume')):
        setattr(publication, attribute, first_value(record.source, label))
    publication.series = joined_values(record.source, 'BookSeriesTitle')
    for attribute, label in (('issue', 'Issue'),
                             ('special_issue', 'SpecialIssue'),
                             ('supplement', 'Supplement')):
        setattr(publication, attribute, first_value(record.source, label))
    publication.article_no = first_value(record.other,
                                         'Identifier.article_no')

    publication.identifiers.append(wok_identifier)

    # Labels in record.other that map onto identifier types.
    idtypes = {
        'Identifier.Isbn': 'ISBN',
        'Identifier.Issn': 'ISSN',
        'Identifier.Doi': 'DOI',
    }
    for pair in record.other:
        if pair.label in idtypes:
            id_type = idtypes[pair.label]
            for value in pair.value:
                publication.identifiers.append(
                    Identifier(unicode(value), type=id_type))

    return publication
from model import Identifier


def _main():
    """Create an Identifier, call train(epochs=500), then save it.

    The model is saved under the name 'identifier_model'.
    """
    model = Identifier()
    model.train(epochs=500)
    model.save('identifier_model')


if __name__ == '__main__':
    _main()