Example #1
def provide_corpus_functionwords(numOfFunctionwords):
    global top_corpus_functionwords

    synchronized_functionwords_file_path = Path("vectors_handling/vectors/synchronized_functionwords/synchronized_functionwords.txt")

    if not util.exists(synchronized_functionwords_file_path):  # cached file not on disk yet
        log('Cannot find synchronized_functionwords file')

        corpus_functionwords = {}

        for domain_dir in os.scandir(setup.database):
            if domain_dir.name == 'europe_data' and setup.domain == 'in':
                for country_dir in os.scandir(domain_dir):
                    country_name = country_dir.name.split('.')[1]
                    log('Counting function words in ' + country_name)
                    for user_dir in os.scandir(country_dir):
                        for file_dir in os.scandir(user_dir):
                            with open(file_dir, "r", encoding="utf-8") as file:
                                for line in file:
                                    for word in line.split():
                                        if word in function_words_map:
                                            corpus_functionwords[word] = corpus_functionwords.get(word, 0) + 1

        # keep only the most frequent function words and cache them to disk
        top_corpus_functionwords = heapq.nlargest(numOfFunctionwords, corpus_functionwords,
                                                  key=corpus_functionwords.get)
        util.save_file(synchronized_functionwords_file_path, top_corpus_functionwords)

    top_corpus_functionwords = util.load_file(synchronized_functionwords_file_path)
    return top_corpus_functionwords
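
The count-then-select pattern above recurs throughout these examples. A minimal, self-contained sketch of the same idea using collections.Counter (a hypothetical standalone equivalent, not part of this project):

from collections import Counter
import heapq

def top_k_words(lines, vocabulary, k):
    # Count how often each vocabulary word appears across all lines,
    # then keep only the k most frequent ones.
    counts = Counter(word for line in lines
                     for word in line.split()
                     if word in vocabulary)
    return heapq.nlargest(k, counts, key=counts.get)

# e.g. top_k_words(["the cat and the dog"], {"the", "and"}, 1) -> ['the']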
Example #2
def generate_top_bipos(save_path):
    log('Generating top bipos')
    all_bipos = {}

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':

            for country_dir in os.scandir(domain_dir):
                country_name = country_dir.name.split('.')[1]
                log('Generating top bipos for ' + country_name)
                for user_dir in os.scandir(country_dir):

                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            for line in file:  # each line holds serialized (token, POS) tuples
                                pos_tokens = re.split(r"'\), \('|'\), \(\"", line)
                                # pair consecutive POS tags into bigrams; the final tuple is
                                # skipped because the split leaves the closing delimiter
                                # attached to its tag
                                for i in range(len(pos_tokens) - 2):
                                    bigram = (re.split(r"', '|\", '", pos_tokens[i])[1] + " "
                                              + re.split(r"', '|\", '", pos_tokens[i + 1])[1])
                                    all_bipos[bigram] = all_bipos.get(bigram, 0) + 1

    top_bipos = heapq.nlargest(300, all_bipos, key=all_bipos.get)  # keep the 300 most frequent POS bigrams
    util.save_file(save_path, top_bipos)
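
The regex splitting assumes each line is the Python repr of a list of (token, POS) tuples. If that assumption holds, ast.literal_eval offers a more robust way to recover the tags (a sketch, not the project's code):

import ast

def pos_bigrams(line):
    # Parse a line like "[('I', 'PRP'), ('run', 'VBP'), ('fast', 'RB')]"
    # and yield consecutive POS-tag pairs.
    tagged = ast.literal_eval(line)
    tags = [pos for _, pos in tagged]
    for first, second in zip(tags, tags[1:]):
        yield first + " " + second

# e.g. list(pos_bigrams("[('I', 'PRP'), ('run', 'VBP')]")) -> ['PRP VBP']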
Example #3
def generate_top_trichars(save_path):
    log('Generating top trichars')
    all_trichars = {}

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':

            for country_dir in os.scandir(domain_dir):
                country_name = country_dir.name.split('.')[1]
                log('Generating top trichars for ' + country_name)
                for user_dir in os.scandir(country_dir):

                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            for line in file:  # parse lines within chunk text

                                if len(line) >= 11:
                                    # each serialized trichar occupies an 11-character block,
                                    # with the payload characters at offsets 1, 4 and 7; the
                                    # bound on cur_char + 7 prevents an IndexError on a
                                    # trailing partial block
                                    cur_char = 0
                                    while cur_char + 7 < len(line):
                                        trichar = line[cur_char + 1] + line[cur_char + 4] + line[cur_char + 7]
                                        all_trichars[trichar] = all_trichars.get(trichar, 0) + 1
                                        cur_char += 11

    top_trichars = heapq.nlargest(1000, all_trichars, key=all_trichars.get)  # keep the 1000 most frequent trichars
    util.save_file(save_path, top_trichars)
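
The fixed offsets above only make sense for this project's specific on-disk serialization. If plain character trigrams are the goal, a minimal standalone sketch (an assumption about intent, not the project's format) is:

def char_trigrams(text):
    # Slide a 3-character window across the text; hypothetical helper,
    # independent of the 11-character block format used above.
    return [text[i:i + 3] for i in range(len(text) - 2)]

# e.g. char_trigrams("abcd") -> ['abc', 'bcd']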
Example #4
def generate_top_unigrams(save_path):
    log('Generating top unigrams')
    all_unigrams = {}

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':

            for country_dir in os.scandir(domain_dir):
                country_name = country_dir.name.split('.')[1]
                log('Generating top unigrams for ' + country_name)
                for user_dir in os.scandir(country_dir):

                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            for line in file:
                                for token in line.split():
                                    all_unigrams[token] = all_unigrams.get(token, 0) + 1

    top_unigrams = heapq.nlargest(1000, all_unigrams, key=all_unigrams.get)  # keep the 1000 most frequent unigrams
    util.save_file(save_path, top_unigrams)
Example #5
def provide_top_spelling_errors():
    spelling_file_path = Path("vectors_handling/vectors/spelling_errors/top_spelling_errors.txt")

    if not util.exists(spelling_file_path):  # cached file not on disk yet
        log('Cannot find top spelling errors file')
        generate_top_spelling_errors(spelling_file_path)

    top_spelling_errors = util.load_file(spelling_file_path)
    return top_spelling_errors
Example #6
def provide_top_unigram():
    unigram_file_path = Path("vectors_handling/vectors/unigrams/top_unigrams.txt")

    if not util.exists(unigram_file_path):  # cached file not on disk yet
        log('Cannot find top unigrams file')
        generate_top_unigrams(unigram_file_path)

    top_unigrams = util.load_file(unigram_file_path)
    return top_unigrams
Example #7
def provide_top_tripos():
    tripos_file_path = Path("vectors_handling/vectors/pos/top_tripos.txt")

    if not util.exists(tripos_file_path):  # cached file not on disk yet
        log('Cannot find top tripos file')
        generate_top_tripos(tripos_file_path)

    top_tripos = util.load_file(tripos_file_path)
    return top_tripos
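
Examples #5 through #7 repeat the same lazy-caching pattern: regenerate on a cache miss, then load from disk. A generic sketch of that pattern, assuming util.exists, util.load_file, util.save_file and log behave as in the snippets above (the helper name is hypothetical):

def provide_cached(file_path, generate_fn):
    # Regenerate the cached artifact only when it is missing on disk,
    # then always load it from the cache so every caller sees the same data.
    if not util.exists(file_path):
        log('Cannot find ' + str(file_path))
        generate_fn(file_path)
    return util.load_file(file_path)

# e.g. top_tripos = provide_cached(tripos_file_path, generate_top_tripos)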
Example #8
def process_dir(domain_dir):
    users = []

    for country_dir in os.scandir(domain_dir):
        country_name = country_dir.name.split('.')[1]
        log('Generating users for ' + country_name)
        for user_dir in os.scandir(country_dir):
            users.append(process_user(user_dir))

    return users
Example #9
def generate(saving_path):
    log('Generating <' + setup.feature + ',' + setup.domain + '> user vectors')

    users = []  # guards against a NameError if no directory matches the configured domain
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            users = process_dir(domain_dir)
        elif domain_dir.name == 'non_europe_data' and setup.domain == 'out':
            users = process_dir(domain_dir)

    util.save_file(saving_path, users)
Example #10
def generate_top_spelling_errors(save_path):
    log('Generating top spelling errors')
    all_spelling_errors = {}

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':

            for country_dir in os.scandir(domain_dir):
                country_name = country_dir.name.split('.')[1]
                log('Generating top spelling errors for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    errors = []
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            for line in file:  # each line is a JSON array of token annotations
                                json_data = json.loads(line)
                                for json_token in json_data:
                                    # collect the edit lists, tagged by edit type; a missing
                                    # or empty list simply contributes nothing
                                    for component in json_token.get('deletions', []):
                                        errors.append("del: " + component)
                                    for component in json_token.get('insertions', []):
                                        errors.append("ins: " + component)
                                    for component in json_token.get('replacements', []):
                                        errors.append("rep: " + str(component))

                    for error in errors:
                        all_spelling_errors[error] = all_spelling_errors.get(error, 0) + 1

    top_spelling_errors = heapq.nlargest(400, all_spelling_errors,
                                         key=all_spelling_errors.get)  # keep the 400 most frequent errors
    util.save_file(save_path, top_spelling_errors)
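
The parsing above presumes each line is a JSON array of per-token annotation objects. A small illustration of the assumed input shape and the resulting error strings (the field names come from the snippet; the concrete values are invented for illustration):

import json

line = '[{"deletions": ["e"], "insertions": [], "replacements": [["teh", "the"]]}]'
for token in json.loads(line):
    for component in token.get('deletions', []):
        print("del: " + component)       # -> del: e
    for component in token.get('replacements', []):
        print("rep: " + str(component))  # -> rep: ['teh', 'the']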
Example #11
def classify(users, countries, train_users=None, train_countries=None):
    log('Starting classification process')
    if setup.type == 'binary':
        clf = LogisticRegression(solver='saga',
                                 max_iter=setup.iterations,
                                 n_jobs=setup.threads,
                                 class_weight='balanced')
    elif setup.type in ['family', 'language']:
        clf = LogisticRegression(solver='lbfgs',
                                 max_iter=setup.iterations,
                                 multi_class='ovr',  # one-vs-rest for the multi-class settings
                                 n_jobs=setup.threads,
                                 class_weight='balanced')
    else:
        raise ValueError('Unknown classification type: ' + setup.type)

    if setup.domain == 'in':
        log('Starting 10-fold cross validation process')
        classifier_scores = cross_val_score(clf, users, countries, cv=10)
        score = np.average(classifier_scores)

    elif setup.domain == 'out':
        log('Starting fit&score process')
        clf_trained = clf.fit(train_users, train_countries)
        score = clf_trained.score(users, countries)

    else:
        raise ValueError('Unknown domain: ' + setup.domain)

    return score
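
A minimal way to exercise this kind of classification logic with synthetic data (a standalone sketch using scikit-learn directly, bypassing the setup module; shapes and labels are invented):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
users = rng.normal(size=(100, 5))          # 100 users, 5 features each
countries = rng.integers(0, 2, size=100)   # binary labels

clf = LogisticRegression(solver='saga', max_iter=1000, class_weight='balanced')
scores = cross_val_score(clf, users, countries, cv=10)  # 10-fold CV, as in the 'in' domain branch
print(np.average(scores))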