Ejemplo n.º 1
0
def custom_similarity(person, key, words_dict):
    """Score how strongly a person's text file matches a weighted word list.

    Every word of ``words_dict[key]`` contributes its number of occurrences
    among the words of the lower-cased person file, multiplied by the word's
    weight divided by the largest weight in the list.  The sum is divided by
    the number of sentences found in the file, multiplied by 55 and capped
    at 7.

    Args:
        person: Person name; the file read is
            ``<definitions.PERSONS_DIR>/<p_lib.remove_spaces(person)>.txt``.
        key: Key selecting the ``{word: weight}`` mapping in ``words_dict``.
        words_dict: Mapping from key to a ``{word: weight}`` mapping.

    Returns:
        The score, never larger than 7.  ``0`` when the person file does not
        exist; a zero score when the word list is empty or its largest
        weight is 0 (nothing can be normalised in that case).

    Raises:
        KeyError: If ``key`` is not in ``words_dict``; the lookup only
            happens when the person file exists.
    """
    result = 0

    person_file = os.path.join(definitions.PERSONS_DIR,
                               p_lib.remove_spaces(person) + ".txt")
    if os.path.isfile(person_file):
        tfidf_words = words_dict[key]
        # Only the largest weight is needed, so avoid sorting the whole
        # mapping; default=0 keeps an empty word list from raising.
        max_weight = max(tfidf_words.values(), default=0)

        with open(person_file, 'r', encoding='utf8') as fr:
            file_content = fr.read()

        # A zero (or missing) maximum weight cannot be used as a divisor;
        # such a word list simply contributes nothing to the score.
        if max_weight:
            person_words = p_lib.split_to_words(file_content.lower())
            document_dict = Counter(person_words)
            for word, weight in tfidf_words.items():
                if word in document_dict:
                    result += document_dict[word] * (weight / max_weight)

        # Treat a text without detectable sentences as one sentence so the
        # normalisation below never divides by zero.
        sentences_count = len(split_into_sentences(file_content)) or 1
        result = result / sentences_count

    result *= 55

    if result > 7:
        return 7

    return result
Ejemplo n.º 2
0
def is_profession_negative(person, profession):
    """Tell whether a person's text file mentions none of a profession's words.

    Returns ``False`` as soon as one of the words returned by
    ``prof_lib.get_similarity_words(profession)`` occurs in the person file,
    and ``True`` otherwise (including when the file does not exist).
    """
    keywords = prof_lib.get_similarity_words(profession)
    file_path = os.path.join(definitions.PERSONS_DIR, p_lib.remove_spaces(person) + ".txt")
    if not os.path.isfile(file_path):
        return True

    with open(file_path, 'r', encoding='utf8') as handle:
        text = handle.read()

    for keyword in keywords:
        if keyword in text:
            return False
    return True
Ejemplo n.º 3
0
def find_nationality_similarity(person_name, nationality):
    """Return a similarity score between a person and a nationality.

    Both names are passed through their ``remove_spaces`` helper and
    lower-cased, the absolute value of ``model.similarity`` for the pair is
    computed, and the result is handed to ``custom_similarity`` together with
    ``NATIONALITY_MULTIPLIER``.  Any exception raised while doing so yields
    ``definitions.DEFAULT_SIMILARITY`` instead.
    """
    person_token = persons.remove_spaces(person_name)
    nationality_token = nationalities.remove_spaces(nationality)
    try:
        raw_score = model.similarity(person_token.lower(),
                                     nationality_token.lower())
        return custom_similarity(abs(raw_score), NATIONALITY_MULTIPLIER)
    except Exception:
        # Best effort: every failure falls back to the default score.
        return definitions.DEFAULT_SIMILARITY
Ejemplo n.º 4
0
def is_nationality_negative(person, nationality):
    """Tell whether a person's text file never mentions a nationality.

    Before the check, every key of ``nat_lib.nationalities_dict`` found in
    the file content is replaced by its mapped value.  Returns ``True`` when
    the file is missing or ``nationality`` does not occur in the rewritten
    content, ``False`` otherwise.
    """
    file_path = os.path.join(definitions.PERSONS_DIR, p_lib.remove_spaces(person) + ".txt")
    if not os.path.isfile(file_path):
        return True

    with open(file_path, 'r', encoding='utf8') as handle:
        text = handle.read()

    for alias, replacement in nat_lib.nationalities_dict.items():
        text = text.replace(alias, replacement)

    return nationality not in text
Ejemplo n.º 5
0
def handle_mayreferto_person(*args):
    """Re-download a person file that looks like a disambiguation stub.

    ``args[0]`` is a line whose first tab-separated field is the person name.
    When the person's file exists, is smaller than 200 bytes and its first
    line contains 'may refer to' or 'is the name of', the file is removed,
    ``download_file`` is called for the person and the name is printed.
    """
    record = args[0]
    person_name = record.split('\t', 1)[0]
    compact_name = persons.remove_spaces(person_name)
    file_name = os.path.join(PERSONS_DIR, compact_name + '.txt')

    if not os.path.isfile(file_name) or os.path.getsize(file_name) >= 200:
        return

    # Leaving the with-block closes the file before it is removed.
    with open(file_name, encoding='utf8', mode='r') as person_file:
        opening = person_file.readline()

    looks_like_stub = 'may refer to' in opening or 'is the name of' in opening
    if looks_like_stub:
        os.remove(file_name)
        download_file(person_name, file_name)
        print(person_name)
def add_training_data(professions):
    """Append person texts from the positive profession training file.

    Each line of ``all_positive_profession.train`` holds a person and a
    profession separated by a tab.  For every line, a newline followed by the
    content of the person's text file is appended to
    ``professions[profession]``, and the line index and person are printed.
    """
    train_path = os.path.join(TRAINING_DIR, 'all_positive_profession.train')
    with open(train_path, encoding='utf8', mode='r') as train_file:
        for index, raw_line in enumerate(train_file):
            fields = raw_line.rstrip().split('\t')
            person = fields[0]
            profession = fields[1]

            person_path = os.path.join(definitions.PERSONS_DIR,
                                       p_lib.remove_spaces(person) + ".txt")
            with open(person_path, 'r', encoding='utf8') as person_file:
                professions[profession] += "\n" + person_file.read()
                print(index, person)
def add_training_data(nationalities):
    """Append person texts from the positive nationality training file.

    Each line of ``all_positive_nationality.train`` holds a person and a
    nationality separated by a tab.  For every line, the content of the
    person's text file followed by a newline is appended to
    ``nationalities[nationality]``, and the line index and person are printed.
    """
    train_path = os.path.join(TRAINING_DIR, 'all_positive_nationality.train')
    with open(train_path, encoding='utf8', mode='r') as train_file:
        for index, raw_line in enumerate(train_file):
            fields = raw_line.rstrip().split('\t')
            person = fields[0]
            nationality = fields[1]

            person_path = os.path.join(definitions.PERSONS_DIR,
                                       p_lib.remove_spaces(person) + ".txt")
            with open(person_path, 'r', encoding='utf8') as person_file:
                nationalities[nationality] += person_file.read() + "\n"
                print(index, person)
Ejemplo n.º 8
0
def find_profession_similarity(person_name, profession):
    """Return a similarity score between a person and a profession.

    The absolute ``model.similarity`` between the lower-cased, space-stripped
    person name and each lower-cased word returned by
    ``professions.get_similarity_words(profession)`` is averaged, and the
    average is passed to ``custom_similarity`` together with
    ``PROFESSION_MULTIPLIER``.

    Args:
        person_name: Name of the person.
        profession: Profession whose similarity words are compared.

    Returns:
        The value returned by ``custom_similarity``, or
        ``definitions.DEFAULT_SIMILARITY`` when any single comparison raises
        or when the profession has no similarity words at all.
    """
    person_name = persons.remove_spaces(person_name)
    profession_words = professions.get_similarity_words(profession)
    result = 0
    total_count = 0
    for word in profession_words:
        try:
            result += abs(model.similarity(person_name.lower(), word.lower()))
            total_count += 1
        except Exception:
            # Best effort: one failing comparison (presumably a word the
            # model does not know) makes the whole score fall back.
            return definitions.DEFAULT_SIMILARITY

    # Without any similarity word there is nothing to average; fall back
    # instead of dividing by zero.
    if total_count == 0:
        return definitions.DEFAULT_SIMILARITY

    result /= total_count
    return custom_similarity(result, PROFESSION_MULTIPLIER)
Ejemplo n.º 9
0
def find_similarity(person_name, term, inputType):
    """Look up the score of ``term`` among a person's extracted values.

    When the person's text file exists it is opened and handed to
    ``get_person_nationalities`` or ``get_person_professions`` depending on
    ``inputType``.  The value stored under ``term`` in that result is
    returned, or ``0`` when ``term`` is absent or the file does not exist.

    Raises:
        TypeError: If the file exists and ``inputType`` is neither
            ``definitions.TYPE_NATIONALITY`` nor ``definitions.TYPE_PROFESSION``.
    """
    scores = {}
    file_path = os.path.join(definitions.PERSONS_DIR,
                             p_lib.remove_spaces(person_name) + ".txt")
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf8') as handle:
            if inputType == definitions.TYPE_NATIONALITY:
                extractor = get_person_nationalities
            elif inputType == definitions.TYPE_PROFESSION:
                extractor = get_person_professions
            else:
                raise TypeError
            scores = extractor(handle)

    return scores[term] if term in scores.keys() else 0
Ejemplo n.º 10
0
def get_positive_nationality(person):
    """Return the single nationality a person's text file clearly states.

    Keys of ``nat_lib.nationalities_dict`` are replaced by their mapped
    values in both the first line and the whole content.  A nationality from
    ``init_nationalities_empty_dict()`` is returned only when it is the sole
    one occurring in the content and it also occurs in the first line; if
    exactly two are found and one of them is 'Republic of Ireland', only that
    one is kept.  Returns ``None`` otherwise, or when the file is missing.
    """
    known_nationalities = init_nationalities_empty_dict()
    file_path = os.path.join(definitions.PERSONS_DIR, p_lib.remove_spaces(person) + ".txt")
    if not os.path.isfile(file_path):
        return None

    with open(file_path, 'r', encoding='utf8') as handle:
        opening = handle.readline()
        handle.seek(0)  # rewind so the full text includes the first line
        body = handle.read()

    for alias, replacement in nat_lib.nationalities_dict.items():
        opening = opening.replace(alias, replacement)
        body = body.replace(alias, replacement)

    found = [name for name in known_nationalities if name in body]

    if len(found) == 2 and 'Republic of Ireland' in found:
        found = ['Republic of Ireland']

    if len(found) == 1 and found[0] in opening:
        return found[0]
    return None
Ejemplo n.º 11
0
def get_positive_profession(person):
    """Return the single profession a person's text file clearly states.

    A profession from ``init_professions_empty_dict()`` counts as mentioned
    in a text when all words of ``prof_lib.get_similarity_words(profession)``
    occur in it.  The profession is returned only when exactly one profession
    is mentioned in the whole content and exactly one in the first line;
    the one from the first line is returned.  Returns ``None`` otherwise, or
    when the file is missing.
    """
    candidates = init_professions_empty_dict()
    file_path = os.path.join(definitions.PERSONS_DIR, p_lib.remove_spaces(person) + ".txt")
    if not os.path.isfile(file_path):
        return None

    with open(file_path, 'r', encoding='utf8') as handle:
        opening = handle.readline()
        handle.seek(0)  # rewind so the full text includes the first line
        body = handle.read()

    in_body = []
    in_opening = []
    for profession in candidates:
        keywords = prof_lib.get_similarity_words(profession)
        if all(word in body for word in keywords):
            in_body.append(profession)
        if all(word in opening for word in keywords):
            in_opening.append(profession)

    if len(in_body) == 1 and len(in_opening) == 1:
        return in_opening[0]
    return None
def download_file(*args):
    """Download a person's DBpedia page into the persons directory.

    ``args[0]`` is a line whose first tab-separated field is the person name.
    If no file exists yet for that person, the page at
    ``http://dbpedia.org/page/<quoted name>`` is fetched with
    ``get_html_content``, passed through ``modify_html_content`` and, when
    the result is non-empty, written to ``<PERSONS_DIR>/<name>.txt``.

    HTTP errors are printed as ``<code>: <url>``; any other exception is
    logged with its traceback.  Nothing is returned and nothing is raised to
    the caller.
    """
    line = args[0]
    person_name = line.split('\t', 1)[0]
    modified_name = persons.remove_spaces(person_name)
    file_name = os.path.join(PERSONS_DIR, modified_name + '.txt')
    url = 'http://dbpedia.org/page/' + urllib.parse.quote(modified_name)

    try:
        if not os.path.isfile(file_name):
            html_content = get_html_content(url)
            html_content = modify_html_content(html_content)

            if len(html_content) > 0:
                # Mode 'x' refuses to overwrite a file created in the
                # meantime; the with-block guarantees the handle is closed
                # and routes write/close failures to the handler below.
                with open(file_name, encoding='utf8', mode='x') as dbpedia_file:
                    dbpedia_file.write(html_content)
    except urllib.error.HTTPError as e:
        print(str(e.code) + ": " + url)
    except Exception:
        logging.error(traceback.format_exc())
Ejemplo n.º 13
0
import os
from wsdm.ts.helpers.persons import persons
from definitions import NOMENCLATURES_DIR
from definitions import PERSONS_DIR

if __name__ == '__main__':
    # Copy every line of persons.txt whose person has no text file in
    # PERSONS_DIR into missing_persons.txt (which is overwritten).
    with open(os.path.join(NOMENCLATURES_DIR, "persons.txt"),
              encoding='utf8', mode='r') as fr, \
            open(os.path.join(NOMENCLATURES_DIR, "missing_persons.txt"),
                 encoding='utf8', mode='w') as fw:
        for line in fr:
            # The person name is the first tab-separated field of the line.
            person_name = line.split('\t', 1)[0]
            modified_name = persons.remove_spaces(person_name)
            file_name = os.path.join(PERSONS_DIR, modified_name + '.txt')
            if os.path.isfile(file_name):
                continue
            fw.write(line)
Ejemplo n.º 14
0
def has_file(person):
    """Return whether ``<PERSONS_DIR>/<remove_spaces(person)>.txt`` is a file."""
    directory = definitions.PERSONS_DIR
    file_name = p_lib.remove_spaces(person) + ".txt"
    return os.path.isfile(os.path.join(directory, file_name))