def process_train(self, selected, train):
    '''Preprocess the training data.'''
    obs = len(selected) * 2   # total number of samples
    nvars = len(self.f2v)     # feature-vector dimensionality
    X = np.zeros(shape=(obs, nvars), dtype=np.float32)
    y = np.zeros(shape=(obs), dtype=np.float32)
    obsnum = 0
    # Convert each sample into a feature vector.
    for example in tqdm(train):
        cid = example['id']
        if cid in selected:
            claim = example['claim']
            c_toks = set(word_tokenize(claim.lower()))
            for yn in selected[cid]:
                [title, lid, line, tscore] = selected[cid][yn]
                t_toks = normalize_title(title)
                t = ' '.join(t_toks)
                t_toks = set(t_toks)
                l_toks = set(word_tokenize(line.lower()))
                self.process_instance(c_toks, t, t_toks, line, l_toks, lid,
                                      tscore, obsnum, X)
                y[obsnum] = float(yn)
                obsnum += 1
    assert obsnum == obs
    return X, y
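# A minimal sketch of the inputs process_train expects, inferred from the loop
# above; the claim id, titles, sentences, and scores are hypothetical, and
# `ranker` stands in for whatever object this method is bound to.
train = [{'id': 137334, 'claim': 'Fox 2000 Pictures released the film Soul Food .'}]
selected = {
    137334: {
        1: ['Soul_Food_(film)', 0, 'Soul Food is a 1997 American film .', 0.9],
        0: ['Soul_(band)', 0, 'Soul was a rock band .', 0.2],
    },
}
X, y = ranker.process_train(selected, train)
# X has shape (len(selected) * 2, len(ranker.f2v)); y holds the 0/1 labels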
def phrase_features(phrase="", start=0, title="", claim=""): features = dict() stoks = phrase.split() t_toks, rmndr = normalize_title(title, rflag=True) features["rmndr"] = (rmndr == "") features["rinc"] = ((rmndr != "") and (rmndr in claim)) features["start"] = start features["start0"] = (start == 0) features["lend"] = len(stoks) features["lend1"] = (features["lend"] == 1) features["cap1"] = stoks[0][0].isupper() features["stop1"] = (stoks[0].lower() in stop) features["people1"] = (stoks[0] in people) features["places1"] = (stoks[0] in places) features["capany"] = False features["capall"] = True features["stopany"] = False features["stopall"] = True features["peopleany"] = False features["peopleall"] = True features["placesany"] = False features["placesall"] = True for tok in stoks: features["capany"] = (features["capany"] or tok[0].isupper()) features["capall"] = (features["capall"] and tok[0].isupper()) features["stopany"] = (features["stopany"] or tok.lower() in stop) features["stopall"] = (features["stopall"] and tok.lower() in stop) features["peopleany"] = (features["peopleany"] or tok in people) features["peopleall"] = (features["peopleall"] and tok in people) features["placesany"] = (features["placesany"] or tok in places) features["placesall"] = (features["placesall"] and tok in places) return features
def title_edict(t2jnum={}):
    edocs = edict()
    for title in t2jnum:
        l_txt = normalize_title(title)
        if len(l_txt) > 0:
            if edocs[l_txt][0] is None:
                edocs[l_txt] = []
            edocs[l_txt][0].append(title)
    return edocs
def normalize_fields(data):
    if data.get('authors') is None:
        data['authors'] = []
    authors = {util.normalize_author(a) for a in data['authors']}
    data['norm-authors'] = sorted(authors)
    if data.get('title') is None:
        data['title'] = ''
    data['norm-title'] = util.normalize_title(data['title'])
    return data
def mint_page_for_url(self, page_class, url):
    my_page = page_class()
    my_page.pmh_id = self.id
    my_page.url = url
    my_page.doi = self.doi
    my_page.title = self.title
    my_page.normalized_title = normalize_title(self.title)
    my_page.authors = self.authors
    my_page.repo_id = self.id.split(":")[1]
    return my_page
def title_edict(t2jnum={}):
    '''Build a lookup dictionary of document titles.'''
    edocs = edict()
    for title in t2jnum:
        _title = normalize_title(title)
        if len(_title) > 0:
            if edocs[_title][0] is None:
                edocs[_title] = []
            edocs[_title][0].append(title)
    return edocs
def phrase_features(phrase='', start=0, title='', claim=''):
    '''Build the feature dictionary for a phrase.

    Args:
        title: document title
        phrase: a phrase taken from the claim
        claim: the claim being compared against
        start: position of the phrase within the claim

    Returns:
        features: feature dictionary for the phrase
    '''
    features = dict()        # feature dictionary
    stoks = phrase.split()   # split the phrase into words
    _, rmndr = normalize_title(title, rflag=True)  # normalize the title and split off the remainder
    features['rmndr'] = (rmndr == '')  # True: no extra title info such as "(xxx)"
    features['rinc'] = (
        (rmndr != '') and (rmndr in claim))  # True: "(xxx)" exists and xxx appears in the claim
    features['start'] = start                 # position of the phrase within the claim
    features['start0'] = (start == 0)         # at the start of the claim
    features['lend'] = len(stoks)             # number of words
    features['lend1'] = (features['lend'] == 1)  # True: exactly one word
    features['cap1'] = stoks[0][0].isupper()  # True: the first word is capitalized
    features['stop1'] = (stoks[0].lower() in stop)  # True: the first word is a stopword
    features['people1'] = (stoks[0] in people)      # True: the first word is a person name
    features['places1'] = (stoks[0] in places)      # True: the first word is a place name
    features['capany'] = False    # True: some word is capitalized
    features['capall'] = True     # True: every word is capitalized
    features['stopany'] = False   # True: some word is a stopword
    features['stopall'] = True    # True: every word is a stopword
    features['peopleany'] = False  # True: some word is a person name
    features['peopleall'] = True   # True: every word is a person name
    features['placesany'] = False  # True: some word is a place name
    features['placesall'] = True   # True: every word is a place name
    for tok in stoks:
        features['capany'] = (features['capany'] or tok[0].isupper())
        features['capall'] = (features['capall'] and tok[0].isupper())
        features['stopany'] = (features['stopany'] or tok.lower() in stop)
        features['stopall'] = (features['stopall'] and tok.lower() in stop)
        features['peopleany'] = (features['peopleany'] or tok in people)
        features['peopleall'] = (features['peopleall'] and tok in people)
        features['placesany'] = (features['placesany'] or tok in places)
        features['placesall'] = (features['placesall'] and tok in places)
    return features
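# A minimal usage sketch with made-up inputs; it assumes normalize_title, stop,
# people, and places are available at module level, as the function body requires.
claim = 'Barack Obama was born in Hawaii .'
feats = phrase_features(phrase='Barack Obama', start=0,
                        title='Barack Obama', claim=claim)
# e.g. feats['lend'] == 2, feats['cap1'] is True, feats['stopall'] is False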
def calc_normalized_title(self):
    if not self.title:
        return None

    working_title = self.title

    # repo-specific rules
    # AMNH adds biblio info to the end of titles, which ruins matching; remove it.
    # example http://digitallibrary.amnh.org/handle/2246/6816
    if "amnh.org" in self.repo_id:
        # cut off the last part, after an opening paren
        working_title = re.sub(u"(Bulletin of.+no.+\d+)", "", working_title,
                               flags=re.IGNORECASE | re.MULTILINE)
        working_title = re.sub(u"(American Museum nov.+no.+\d+)", "", working_title,
                               flags=re.IGNORECASE | re.MULTILINE)

    return normalize_title(working_title)
def best_lines(claim="",tscores=list(),lines=dict(),best=5,model=None): lscores=list() c_toks=set(word_tokenize(claim.lower())) for title,tscore in tscores: t_toks=normalize_title(title) t=" ".join(t_toks) t_toks=set(t_toks) for lid in lines[title]: line=lines[title][lid] l_toks=set(word_tokenize(line.lower())) if len(l_toks) > 0: if model==None: lscores.append((title,lid,score_line(line_features(c_toks,t,t_toks,line,l_toks,lid,tscore)))) else: lscores.append((title,lid,model.score_instance(c_toks,t,t_toks,line,l_toks,lid,tscore))) lscores=sorted(lscores,key=lambda x:-1*x[2])[:best] return lscores
def calc_normalized_title(self):
    if not self.title:
        return None

    working_title = self.title

    # repo-specific rules
    # AMNH adds biblio info to the end of titles, which ruins matching; remove it.
    # example http://digitallibrary.amnh.org/handle/2246/6816 oai:digitallibrary.amnh.org:2246/6816
    if "amnh.org" in self.id:
        # cut off the last part, after an opening paren
        working_title = re.sub(u"(Bulletin of.+no.+\d+)", "", working_title,
                               flags=re.IGNORECASE | re.MULTILINE)
        working_title = re.sub(u"(American Museum nov.+no.+\d+)", "", working_title,
                               flags=re.IGNORECASE | re.MULTILINE)

    # endpoint 0dde28a908329849966 adds this to the end of all titles, so remove it
    # (eg http://hdl.handle.net/11858/00-203Z-0000-002E-72BD-3)
    working_title = re.sub(u"vollständige digitalisierte Ausgabe", "", working_title,
                           flags=re.IGNORECASE | re.MULTILINE)
    return normalize_title(working_title)
def best_lines(claim='', tscores=list(), lines=dict(), best=5, model=None):
    '''Return the highest-scoring lines (the top `best` of them).'''
    lscores = list()
    c_toks = set(word_tokenize(claim.lower()))
    for title, tscore in tscores:
        t_toks = normalize_title(title)  # normalize the title
        t = ' '.join(t_toks)
        t_toks = set(t_toks)
        for lid in lines[title]:  # line ids of the sentences under this title
            line = lines[title][lid]
            l_toks = set(word_tokenize(line.lower()))
            if len(l_toks) > 0:
                lscores.append((title, lid,
                                model.score_instance(c_toks, t, t_toks, line,
                                                     l_toks, lid, tscore)))
    lscores = sorted(lscores, key=lambda x: -1 * x[2])[:best]
    return lscores
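# A minimal usage sketch with hypothetical data; it assumes `model` is a trained
# scorer exposing score_instance(...) as called above, and that `lines` maps each
# title to a dict of {line_id: sentence_text}.
tscores = [('Barack_Obama', 0.93)]
lines = {'Barack_Obama': {0: 'Barack Obama served as the 44th president .',
                          1: 'He was born in Honolulu , Hawaii .'}}
top = best_lines(claim='Obama was the 44th president .',
                 tscores=tscores, lines=lines, best=2, model=model)
# top is a list of (title, line_id, score) tuples, highest score first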
def process_train(self, selected, train):
    obs = len(selected) * 2
    nvars = len(self.f2v)
    X = np.zeros(shape=(obs, nvars), dtype=np.float32)
    y = np.zeros(shape=(obs), dtype=np.float32)
    obsnum = 0
    for example in tqdm(train):
        cid = example["id"]
        if cid in selected:
            claim = example["claim"]
            c_toks = set(word_tokenize(claim.lower()))
            for yn in selected[cid]:
                [title, lid, line, tscore] = selected[cid][yn]
                t_toks = normalize_title(title)
                t = " ".join(t_toks)
                t_toks = set(t_toks)
                l_toks = set(word_tokenize(line.lower()))
                self.process_instance(c_toks, t, t_toks, line, l_toks, lid,
                                      tscore, obsnum, X)
                y[obsnum] = float(yn)
                obsnum += 1
    assert obsnum == obs
    return X, y
def main():
    """Print one title per line from the corpus."""
    year = 2014
    # months = ['01', '02', '03', '04', '05', '06', '07']  # 2015-08-05
    months = range(11, 13)
    # months = ['02']  # 2015-08-13
    # months = ['02', '03', '04', '05']  # 2015-08-05
    # months = ['03']  # 2015-08-13
    days = xrange(1, 32)

    paths = [
        '/cs/puls/Corpus/Business/Puls/{}/{}/{:2d}/'.format(year, month, day)
        for month in months
        for day in days
    ]

    collected = 0
    for i, fname in enumerate(get_file_names(paths)):
        if i % 100 == 0:
            logger.info("{} / {}".format(collected, i))

        try:
            title = extract_title(fname)
        except:
            logger.debug('Failed to find title')
            continue

        if not title:  # no title
            continue

        title = normalize_title(title)

        # is not monocase and is English
        if not is_monocase(nltk.word_tokenize(title)) and \
           guessLanguage(title) == "en":
            body = get_document_content_paf(fname)
            if len(body.strip()) > 0:  # non-empty body
                collected += 1
                print json.dumps([fname, unicode(title).encode("utf8")])
def update(self):
    if not self.crossref_api_raw_new:
        self.crossref_api_raw_new = self.crossref_api_raw

    if not self.title:
        self.title = self.crossref_title

    self.normalized_title = normalize_title(self.title)

    old_response_jsonb = self.response_jsonb
    self.clear_results()
    try:
        self.recalculate()
    except NoDoiException:
        logger.info(u"invalid doi {}".format(self))
        self.error += "Invalid DOI"
        pass

    self.set_results()

    if self.has_changed(old_response_jsonb):
        self.last_changed_date = datetime.datetime.utcnow().isoformat()
def mint_pages(self):
    if u"oai:" not in self.id:
        return

    self.pages = []
    for url in self.get_good_urls():
        if self.doi:
            my_page = self.mint_page_for_url(PageDoiMatch, url)
            self.pages.append(my_page)

        if self.title:
            normalized_title = normalize_title(self.title)
            if normalized_title:
                my_page = self.mint_page_for_url(PageTitleMatch, url)
                pages_with_this_normalized_title = PageTitleMatch.query.filter(
                    PageTitleMatch.normalized_title == normalized_title).all()
                if len(pages_with_this_normalized_title) >= 20:
                    my_page.more_than_20 = True
                self.pages.append(my_page)
    return self.pages
def phrase_features(phrase="", start=0, title="", claim="", ctoks=word_tokenize("dummy"), termfreqs=dict()): features = dict() stoks = phrase.split() t_toks, rmndr = normalize_title(title, rflag=True) features["terms"] = 0 features["terms0"] = 0 numtoks = 0 for tok in ctoks: if tok in termfreqs: tf, tf0 = termfreqs[tok] features["terms"] += (tf > 0) features["terms0"] += (tf0 > 0) numtoks += 1 features["rmndr"] = (rmndr == "") features["rinc"] = ((rmndr != "") and (rmndr in claim)) features["start"] = start features["start0"] = (start == 0) features["lend"] = len(stoks) features["lend1"] = (features["lend"] == 1) features["cap1"] = stoks[0][0].isupper() features["stop1"] = (stoks[0].lower() in stop) features["capany"] = False features["capall"] = True features["stopany"] = False features["stopall"] = True for tok in stoks: features["capany"] = (features["capany"] or tok[0].isupper()) features["capall"] = (features["capall"] and tok[0].isupper()) features["stopany"] = (features["stopany"] or tok.lower() in stop) features["stopall"] = (features["stopall"] and tok.lower() in stop) return features
def normalize_title(meta):
    meta['norm-title'] = util.normalize_title(meta['title'])
    return meta
def api_to_db(query_doi=None, first=None, last=None, today=False, chunk_size=None):
    i = 0
    records_to_save = []

    headers = {"Accept": "application/json", "User-Agent": "impactstory.org"}

    base_url_with_last = "http://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows=1000&cursor={next_cursor}"
    base_url_no_last = "http://api.crossref.org/works?filter=from-created-date:{first}&rows=1000&cursor={next_cursor}"
    base_url_doi = "http://api.crossref.org/works?filter=doi:{doi}"

    # but if you want all changes, use "indexed" rather than "created", as per
    # https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # base_url_with_last = "http://api.crossref.org/works?filter=from-indexed-date:{first},until-indexed-date:{last}&rows=1000&cursor={next_cursor}"
    # base_url_no_last = "http://api.crossref.org/works?filter=from-indexed-date:{first}&rows=1000&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_so_far = 0

    if today:
        last = datetime.date.today().isoformat()
        first = (datetime.date.today() - datetime.timedelta(days=2)).isoformat()

    if not first:
        first = "2016-04-01"

    while has_more_responses:
        if query_doi:
            url = base_url_doi.format(doi=query_doi)
        else:
            if last:
                url = base_url_with_last.format(first=first, last=last, next_cursor=next_cursor)
            else:
                # the query is much faster without a last date specified, even one far in the future
                url = base_url_no_last.format(first=first, next_cursor=next_cursor)

        logger.info(u"calling url: {}".format(url))

        start_time = time()
        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(start_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)

        if not resp_data["items"] or not next_cursor:
            has_more_responses = False

        for api_raw in resp_data["items"]:
            # logger.info(u":")
            api = {}
            doi = api_raw["DOI"].lower()

            # using the _source key for now because that's how it came out of ES and
            # we haven't switched everything over yet
            api["_source"] = build_crossref_record(api_raw)
            api["_source"]["doi"] = doi

            my_pub = Pub(id=doi, api=api, api_raw=api_raw)
            my_pub.title = api["_source"]["title"]
            my_pub.normalized_title = normalize_title(my_pub.title)

            db.session.merge(my_pub)
            logger.info(u"got record {}".format(my_pub))
            records_to_save.append(my_pub)

            if len(records_to_save) >= 100:
                safe_commit(db)
                num_so_far += len(records_to_save)
                records_to_save = []
                logger.info(u"committing. have committed {} so far, in {} seconds, is {} per hour".format(
                    num_so_far, elapsed(start_time, 1),
                    num_so_far / (elapsed(start_time, 1) / (60 * 60))))

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    safe_commit(db)
    logger.info(u"done everything")