def rm_existing(authors):
    ''' Collapse author strings that embed other author strings.

    For each author, strip out every other author that occurs as a
    substring yet is textually dissimilar (similarity below 0.90),
    then collect the cleaned, whitespace-trimmed names in a fresh
    SimilaritySet.
    '''
    cleaned = SimilaritySet(cutoff=CUTOFF)
    for name in authors:
        for other in authors:
            # Remove an embedded but dissimilar author fragment;
            # identical strings survive (self-similarity >= 0.90).
            if other in name and string_similarity(other, name) < 0.90:
                name = name.replace(other, '')
        cleaned.add(name.strip())

    return cleaned
def listauthors(root):
    ''' Walk the lxml-objectify tree and collect author names.

    Visits every <algorithm> element and delegates author extraction
    to filter_authors for each algorithm's <variant>/<author> tags.
    '''
    authors = SimilaritySet(cutoff=CUTOFF)
    if root.algorithm is None:
        return authors

    # the similarity_set callback to replace or remove special characters
    authors.set_callback(replace)
    for algorithm in root.algorithm:
        # error handling: if the tag does not exist, do not look at it
        if algorithm.find("variant") is None:
            continue
        if algorithm.variant.find("author") is None:
            continue
        authors.update(filter_authors(algorithm.variant.author))

    return authors
def filter_authors(tags):
    ''' Read the XML author tags, split their text on the tokenizer
        regex, and collect the pieces; then filter duplicates and
        stopwords from the result.

    Only tags carrying a "confidence" attribute are processed.

    Fix: author_text used to be assigned only inside the
    "confidence" check but was consumed unconditionally, which
    raised a NameError when the first tag had no confidence
    attribute and silently re-added the previous tag's names for
    any later tag without one.
    '''
    res = SimilaritySet(cutoff=CUTOFF)
    res.set_callback(replace)
    for author in tags:
        # Skip tags without a confidence attribute entirely.
        if "confidence" not in author.attrib:
            continue

        # split authors on special characters
        author_text = map(lambda x: x.strip(),
                          re.split(tokenize_regex, turn_unicode(author.text)))

        for a in author_text:
            # drop overly long fragments (more than 6 space-separated words)
            if len(a.split(" ")) <= 6:
                res.add(a)

    res = rm_existing(res)
    res = rm_stopwords(stopwords, res)
    return res