def pinMaxQuery(pubs): '''Query google scholar use: "xxx a" OR "xxx b" OR ..., max 256 chars. return: query, used_pubs, nouse_pubs(write citation to -10 back to db) ''' printout = False maxchars = 256 query = "" total_pubs_used = 0 used_pubs = [] nouse_pubs = [] total_title_length = 0 for pub in pubs: # clean title cleaned_titles = GoogleDataCleaner.cleanGoogleTitle(pub.title) cleaned_title = cleaned_titles[0] # Add by gb Nov 05, 2011, filter out nouse titles. if cleaned_title is None or len(re.split('[\W+]', cleaned_title)) < 3: print "**** no-use-title: ", cleaned_title # pub.ncitation = -1; nouse_pubs.append(pub) continue # calc new length new_length = Extractor.__calc_control_char_length( total_pubs_used + 1) + total_title_length + len(cleaned_title) # splits = cleaned_title.split("\\s+") if splits is not None and len(splits) > 1: if total_pubs_used == 0: # if the first one-word paper, only get this. new_length += 255 else: # skip this one. continue # first pub must be here, to avoid first pub title length > 255 if total_pubs_used > 0 and new_length > maxchars: break # overflow # real pin if total_pubs_used > 0: query += 'OR' query += ''.join(('"', cleaned_title, '"')) used_pubs.append(pub) total_pubs_used += 1 total_title_length += len(cleaned_title) if printout: # DEBUG PRINT blue_temple = "\033[34m%s\033[0m" print blue_temple % 'pin query done' q = ('query(%s): %s' % (len(query), query)) print blue_temple % q t = 'use %s pubs' % total_pubs_used print blue_temple % t return query, used_pubs, nouse_pubs
def pinMaxQuery(pubs):
    '''Query google scholar use: "xxx a" OR "xxx b" OR ..., max 256 chars.

    return: query, used_pubs, nouse_pubs(write citation to -10 back to db)
    '''
    # NOTE(review): duplicate of the other pinMaxQuery definition in this
    # file -- keep the two in sync or remove one.
    printout = False          # set True for the colored debug dump below
    maxchars = 256            # Google Scholar query length limit
    query = ""
    total_pubs_used = 0
    used_pubs = []            # pubs folded into the query
    nouse_pubs = []           # pubs with unusable titles
    total_title_length = 0    # running length of all quoted titles
    for pub in pubs:
        # clean title
        cleaned_titles = GoogleDataCleaner.cleanGoogleTitle(pub.title)
        cleaned_title = cleaned_titles[0]
        # Add by gb Nov 05, 2011, filter out nouse titles.
        # NOTE(review): '[\W+]' is a character class (splits on any single
        # non-word char OR '+'); runs of punctuation yield empty tokens that
        # inflate the count -- probably meant r'\W+'.
        if cleaned_title is None or len(re.split('[\W+]', cleaned_title)) < 3:
            print "**** no-use-title: ", cleaned_title
            # pub.ncitation = -1;
            nouse_pubs.append(pub)
            continue
        # calc new length: control chars + accumulated titles + this title
        new_length = Extractor.__calc_control_char_length(total_pubs_used + 1) + total_title_length + len(cleaned_title)
        # NOTE(review): str.split takes a LITERAL separator, not a regex, so
        # this splits on the three characters "\s+" and almost always returns
        # a single-element list -- the branch below is effectively dead code.
        splits = cleaned_title.split("\\s+")
        if splits is not None and len(splits) > 1:
            if total_pubs_used == 0:
                # if the first one-word paper, only get this.
                new_length += 255
            else:
                # skip this one.
                continue
        # first pub must be here, to avoid first pub title length > 255
        if total_pubs_used > 0 and new_length > maxchars:
            break  # overflow
        # real pin
        if total_pubs_used > 0:
            query += 'OR'
        query += ''.join(('"', cleaned_title, '"'))
        used_pubs.append(pub)
        total_pubs_used += 1
        total_title_length += len(cleaned_title)
    if printout:  # DEBUG PRINT
        blue_temple = "\033[34m%s\033[0m"
        print blue_temple % 'pin query done'
        q = ('query(%s): %s' % (len(query), query))
        print blue_temple % q
        t = 'use %s pubs' % total_pubs_used
        print blue_temple % t
    return query, used_pubs, nouse_pubs
def matchPub(self, pubs, extracted_map, check_person=False, debug_output=False): '''Match pub with extracted @return (pubs_matched, pubs_not_matched) @params: pubs - Publication read from database. extracted_map - same with all_models {key_title:[ExtractedModel,...]} check_person - if True, will check if authors is matched with authors in db.(will ignore ...). default False. Search using author:xxx do not need author check, this work is done by google. ''' if pubs is None or len(pubs) == 0: return [], pubs if extracted_map is None or len(extracted_map) == 0: return [], pubs if self.debug and False: print 'match %s pubs in %s extracted items' % (len(pubs), len(extracted_map)) # match print_not_matched = False pubs_matched = [] pubs_not_matched = [] for pub in pubs: cleanned_tuple = GoogleDataCleaner.cleanGoogleTitle(pub.title) key_title = cleanned_tuple[1] has_dot = cleanned_tuple[2] # find models list models = [] if key_title in extracted_map: # title is full, no ignore models = extracted_map[key_title] else: # title in results has ..., ignored. if has_dot: for short_key, extracted_models in extracted_map.items(): if key_title.find(short_key) != -1: models.extend(extracted_models) # exact match if models is not None and len(models) > 0: max_citation_model = None debug_all_author_string = [] for model in models: debug_all_author_string.append(model.authors) if model.authors is None or \ self.matchAuthors(model.authors, pub.authors, debug_output=False, debug_title=model.title): # if author matched. if max_citation_model is None or int( max_citation_model.ncitation) < int( model.ncitation): max_citation_model = model # select max citation? if max_citation_model is not None: # allow 10% discount if max_citation_model.ncitation >= pub.ncitation: pub.ncitation = max_citation_model.ncitation pub.increased = max_citation_model.ncitation - pub.ncitation pubs_matched.append(pub) else: # citation model not found. 
if debug_output: print "[DEBUG] PubMatcher.matchPub: Author not match. ", \ "\n\tTitle:%s \n\tRequired:%s \n\tGot(last):%s" % \ (pub.title, pub.authors, "\n".join(debug_all_author_string)) # print not matched? for pub in pubs: title = pub.title found = False for matched in pubs_matched: if title == matched.title: found = True break if not found: pubs_not_matched.append(pub) if print_not_matched: print 'this pub not matched: ', pub return (pubs_matched, pubs_not_matched)
def matchPub(self, pubs, extracted_map, check_person=False, debug_output=False):
    '''Match pub with extracted

    @return (pubs_matched, pubs_not_matched)
    @params:
        pubs - Publication read from database.
        extracted_map - same with all_models {key_title:[ExtractedModel,...]}
        check_person - if True, will check if authors is matched with authors
            in db.(will ignore ...). default False. Search using author:xxx
            do not need author check, this work is done by google.
    '''
    # NOTE(review): duplicate of the other exact-match matchPub in this file.
    if pubs is None or len(pubs) == 0:
        return [], pubs
    if extracted_map is None or len(extracted_map) == 0:
        return [], pubs
    if self.debug and False:  # debug trace permanently disabled by "and False"
        print 'match %s pubs in %s extracted items' % (len(pubs), len(extracted_map))
    # match
    print_not_matched = False
    pubs_matched = []
    pubs_not_matched = []
    for pub in pubs:
        cleanned_tuple = GoogleDataCleaner.cleanGoogleTitle(pub.title)
        key_title = cleanned_tuple[1]
        has_dot = cleanned_tuple[2]  # result title was truncated with "..."
        # find models list
        models = []
        if key_title in extracted_map:
            # title is full, no ignore
            models = extracted_map[key_title]
        else:
            # title in results has ..., ignored.
            if has_dot:
                for short_key, extracted_models in extracted_map.items():
                    if key_title.find(short_key) != -1:
                        models.extend(extracted_models)
        # exact match
        if models is not None and len(models) > 0:
            max_citation_model = None
            debug_all_author_string = []
            for model in models:
                debug_all_author_string.append(model.authors)
                if model.authors is None or \
                   self.matchAuthors(model.authors, pub.authors, debug_output=False, debug_title=model.title):
                    # if author matched.
                    if max_citation_model is None or int(max_citation_model.ncitation) < int(model.ncitation):
                        max_citation_model = model
            # select max citation?
            if max_citation_model is not None:
                # allow 10% discount
                if max_citation_model.ncitation >= pub.ncitation:
                    pub.ncitation = max_citation_model.ncitation
                    # NOTE(review): ncitation was just overwritten above, so
                    # this delta is always 0 -- the two assignments should
                    # probably be swapped.
                    pub.increased = max_citation_model.ncitation - pub.ncitation
                    pubs_matched.append(pub)
            else:
                # citation model not found.
                if debug_output:
                    print "[DEBUG] PubMatcher.matchPub: Author not match. ", \
                        "\n\tTitle:%s \n\tRequired:%s \n\tGot(last):%s" % \
                        (pub.title, pub.authors, "\n".join(debug_all_author_string))
    # print not matched?
    for pub in pubs:
        title = pub.title
        found = False;
        for matched in pubs_matched:
            if title == matched.title:
                found = True
                break
        if not found:
            pubs_not_matched.append(pub)
            if print_not_matched:
                print 'this pub not matched: ', pub
    return (pubs_matched, pubs_not_matched)
def matchPub(self, pubs, extracted_map, check_person=False, debug_output=False): '''Match pub with extracted @return (pubs_matched, pubs_not_matched) @params: pubs - Publication read from database. extracted_map - same with all_models {key_title:[ExtractedModel,...]} check_person - if True, will check if authors is matched with authors in db.(will ignore ...). default False. Search using author:xxx do not need author check, this work is done by google. ''' if pubs is None or len(pubs) == 0: return [], pubs if extracted_map is None or len(extracted_map) == 0: return [], pubs if self.debug and False: print 'match %s pubs in %s extracted items' % (len(pubs), len(extracted_map)) # match print_not_matched = False pubs_matched = [] pubs_not_matched = [] for pub in pubs: cleanned_tuple = GoogleDataCleaner.cleanGoogleTitle(pub.title) key_title = cleanned_tuple[1] has_dot = cleanned_tuple[2] # First Match, select loose match. ExtractedPub List models = [] # different with v1, a looser match. # title allow 10% mistake. # author loose match. for short_key, extracted_models in extracted_map.items(): matched = False if has_dot: # ignore and contains if key_title.find(short_key) != -1: matched = True _m = extracted_models for m in _m: # Add Loose Value m.looseValue += 1 models.extend(_m) else: # direct match if key_title == short_key: matched = True _m = extracted_map[key_title] for m in _m: # Add Loose Value m.looseValue += 0 models.extend(_m) # try loose match if not matched: ed = editdist.distance(short_key, key_title) if ed < 10: looseValue = float(len(key_title)) * (10 / float(100)) if looseValue > ed: # remove ed not match much _m = extracted_models for m in _m: m.looseValue += ed models.extend(_m) # if True and ed < 10: # print '-' * 100 # print 'title: %s ' % key_title # print 'short: %s ' % short_key # print 'ed is: %s ' % ed # print 'loose: %s ' % looseValue # Exact match, select who is the right one. 
if models is not None and len(models) > 0: max_citation_model = None for model in models: if model.authors is None or \ self.matchAuthors(model.authors, pub.authors, debug_output=False, debug_title=model.title): # if author matched. if max_citation_model is None or int( max_citation_model.ncitation) < int( model.ncitation): max_citation_model = model # select max citation? if max_citation_model is not None: if max_citation_model.ncitation >= pub.ncitation: pub.ncitation = max_citation_model.ncitation pub.increased = max_citation_model.ncitation - pub.ncitation pub.pdflink = max_citation_model.pdfLink pub.web_url = max_citation_model.web_url pubs_matched.append(pub) if pub.pdflink is None: file_object = open('web_url.txt', 'a') web_url = pub.web_url Id = str(pub.id) Title = pub.title Author = str(pub.authors) file_object.write(" ".join( [Id, Title, Author, web_url])) file_object.write("\n") file_object.close() else: file_object = open('paper_link.txt', 'a') Id = str(pub.id) Title = pub.title Pdflink = str(pub.pdflink) Author = str(pub.authors) file_object.write(" ".join( [Id, Title, Author, Pdflink])) file_object.write("\n") file_object.close() for pub in pubs: title = pub.title found = False for matched in pubs_matched: if title == matched.title: found = True break if not found: pubs_not_matched.append(pub) if print_not_matched: print 'this pub not matched: ', pub return (pubs_matched, pubs_not_matched)
def matchPub(self, pubs, extracted_map, check_person=False, debug_output=False):
    '''Match pub with extracted

    @return (pubs_matched, pubs_not_matched)
    @params:
        pubs - Publication read from database.
        extracted_map - same with all_models {key_title:[ExtractedModel,...]}
        check_person - if True, will check if authors is matched with authors
            in db.(will ignore ...). default False. Search using author:xxx
            do not need author check, this work is done by google.
    '''
    # NOTE(review): duplicate of the other loose-match matchPub in this file.
    if pubs is None or len(pubs) == 0:
        return [], pubs
    if extracted_map is None or len(extracted_map) == 0:
        return [], pubs
    if self.debug and False:  # debug trace permanently disabled by "and False"
        print 'match %s pubs in %s extracted items' % (len(pubs), len(extracted_map))
    # match
    print_not_matched = False
    pubs_matched = []
    pubs_not_matched = []
    for pub in pubs:
        cleanned_tuple = GoogleDataCleaner.cleanGoogleTitle(pub.title)
        key_title = cleanned_tuple[1]
        has_dot = cleanned_tuple[2]  # result title was truncated with "..."
        # First Match, select loose match. ExtractedPub List
        models = []
        # different with v1, a looser match.
        # title allow 10% mistake.
        # author loose match.
        for short_key, extracted_models in extracted_map.items():
            matched = False
            if has_dot:
                # ignore and contains
                if key_title.find(short_key) != -1:
                    matched = True
                    _m = extracted_models
                    for m in _m:
                        # Add Loose Value
                        m.looseValue += 1;
                    models.extend(_m)
            else:
                # direct match
                if key_title == short_key:
                    matched = True
                    _m = extracted_map[key_title]
                    for m in _m:
                        # Add Loose Value (NOTE(review): "+= 0" is a no-op)
                        m.looseValue += 0;
                    models.extend(_m);
            # try loose match
            if not matched:
                ed = editdist.distance(short_key, key_title)
                if ed < 10:
                    looseValue = float(len(key_title)) * (10 / float(100))
                    if looseValue > ed:
                        # remove ed not match much
                        _m = extracted_models
                        for m in _m:
                            m.looseValue += ed;
                        models.extend(_m)
            # if True and ed < 10:
            #     print '-' * 100
            #     print 'title: %s ' % key_title
            #     print 'short: %s ' % short_key
            #     print 'ed is: %s ' % ed
            #     print 'loose: %s ' % looseValue
        # Exact match, select who is the right one.
        if models is not None and len(models) > 0:
            max_citation_model = None
            for model in models:
                if model.authors is None or \
                   self.matchAuthors(model.authors, pub.authors, debug_output=False, debug_title=model.title):
                    # if author matched.
                    if max_citation_model is None or int(max_citation_model.ncitation) < int(model.ncitation):
                        max_citation_model = model
            # select max citation?
            if max_citation_model is not None:
                if max_citation_model.ncitation >= pub.ncitation:
                    pub.ncitation = max_citation_model.ncitation
                    # NOTE(review): ncitation was just overwritten above, so
                    # this delta is always 0 -- the two assignments should
                    # probably be swapped.
                    pub.increased = max_citation_model.ncitation - pub.ncitation
                    pub.pdflink = max_citation_model.pdfLink
                    pub.web_url = max_citation_model.web_url
                    pubs_matched.append(pub)
                    # NOTE(review): files opened without "with"/try-finally;
                    # an exception during write leaks the handle.
                    if pub.pdflink is None:
                        file_object = open('web_url.txt', 'a')
                        web_url= pub.web_url
                        Id = str(pub.id)
                        Title = pub.title
                        Author = str(pub.authors)
                        file_object.write(" ".join([Id, Title, Author, web_url]))
                        file_object.write("\n")
                        file_object.close()
                    else:
                        file_object = open('paper_link.txt', 'a')
                        Id = str(pub.id)
                        Title = pub.title
                        Pdflink = str(pub.pdflink)
                        Author = str(pub.authors)
                        file_object.write(" ".join([Id, Title, Author, Pdflink]))
                        file_object.write("\n")
                        file_object.close()
    # collect everything that did not make it into pubs_matched
    for pub in pubs:
        title = pub.title
        found = False;
        for matched in pubs_matched:
            if title == matched.title:
                found = True
                break
        if not found:
            pubs_not_matched.append(pub)
            if print_not_matched:
                print 'this pub not matched: ', pub
    return (pubs_matched, pubs_not_matched)