def rm_existing(authors):
    ''' Strips author entries that are embedded inside other author entries.

    For every entry in ``authors``, each entry that occurs in it as a
    substring and scores below 0.90 on ``string_similarity`` against it is
    cut out of the text. What remains is whitespace-stripped and added to
    a new ``SimilaritySet`` built with ``cutoff=CUTOFF``, which is returned.
    ``authors`` is iterated once per entry, so it must be re-iterable.
    '''
    cleaned = SimilaritySet(cutoff=CUTOFF)
    for entry in authors:
        remainder = entry
        for other in authors:
            # Only substrings of the (possibly already shortened) text matter.
            if other not in remainder:
                continue
            # A near-identical match (which includes the entry itself) is
            # kept; only clearly shorter embedded names are removed.
            if not string_similarity(other, remainder) < 0.90:
                continue
            remainder = remainder.replace(other, '')
        cleaned.add(remainder.strip())
    return cleaned
def filter_authors(tags):
    ''' Collects author names from the xml author tags.

    Only tags with a "confidence" attribute are read. The tag text is
    passed through ``turn_unicode``, split on ``tokenize_regex``, and every
    whitespace-stripped piece with at most six space-separated parts is
    added to a ``SimilaritySet`` (``cutoff=CUTOFF``, callback ``replace``).
    The collected set is then passed through ``rm_existing`` and
    ``rm_stopwords`` and the result is returned.
    '''
    collected = SimilaritySet(cutoff=CUTOFF)
    collected.set_callback(replace)

    for tag in tags:
        if "confidence" not in tag.attrib:
            continue

        # split authors on special characters
        pieces = [
            piece.strip()
            for piece in re.split(tokenize_regex, turn_unicode(tag.text))
        ]
        for name in pieces:
            # Skip pieces that split into more than six parts on spaces.
            if len(name.split(" ")) <= 6:
                collected.add(name)

    without_embedded = rm_existing(collected)
    return rm_stopwords(stopwords, without_embedded)