def rm_existing(authors):
    """Remove authors that are embedded substrings of other author names.

    For each author, strip out every other author name that occurs inside
    it, unless the two names are nearly identical (similarity >= 0.90),
    in which case the containment is treated as a genuine match and kept.
    Returns a new SimilaritySet of the cleaned names.
    """
    result = SimilaritySet(cutoff=CUTOFF)
    for outer in authors:
        cleaned = outer
        for inner in authors:
            # NOTE(review): comparison runs against the progressively
            # cleaned name, mirroring the original accumulation order.
            if inner in cleaned and string_similarity(inner, cleaned) < 0.90:
                cleaned = cleaned.replace(inner, '')
        result.add(cleaned.strip())
    return result
def listauthors(root):
    """Collect author names from an objectify-parsed XML tree.

    Walks every <algorithm> element and, where an algorithm has a
    variant/author tag, merges the filtered authors into one
    SimilaritySet. Returns an empty set when no algorithms exist.
    """
    authors = SimilaritySet(cutoff=CUTOFF)
    if root.algorithm is None:
        return authors
    # the similarity-set callback replaces or removes special characters
    authors.set_callback(replace)
    for alg in root.algorithm:
        # error handling: if the tag does not exist, do not even look at it
        has_variant = alg.find("variant") is not None
        if has_variant and alg.variant.find("author") is not None:
            authors.update(filter_authors(alg.variant.author))
    return authors
def filter_authors(tags):
    """Read XML <author> tags and return a cleaned SimilaritySet of names.

    Only tags carrying a "confidence" attribute are considered. Each
    author string is split on special characters (tokenize_regex),
    fragments longer than 6 space-separated words are discarded, then
    embedded duplicates and stopwords are removed.
    """
    res = SimilaritySet(cutoff=CUTOFF)
    # callback replaces or removes special characters on insertion
    res.set_callback(replace)
    for author in tags:
        # only trust annotations that carry a confidence score
        if "confidence" in author.attrib:
            # split authors on special characters
            fragments = (frag.strip()
                         for frag in re.split(tokenize_regex,
                                              turn_unicode(author.text)))
            for frag in fragments:
                # FIX: re.split yields empty strings when the pattern
                # matches at the boundaries; previously "" passed the
                # word-count check and was added to the set. Skip them.
                if frag and len(frag.split(" ")) <= 6:
                    res.add(frag)
    res = rm_existing(res)
    res = rm_stopwords(stopwords, res)
    return res