def provide_corpus_functionwords(numOfFunctionwords):
    global top_corpus_functionwords
    top_corpus_functionwords = {}
    synchronized_functionwords_file_path = Path("vectors_handling/vectors/synchronized_functionwords/synchronized_functionwords.txt")
    if not util.exists(synchronized_functionwords_file_path):  # file not found on disk, so build it
        log('Cannot find synchronized_functionwords file')
        corpus_functionwords = {}
        for domain_dir in os.scandir(setup.database):
            if domain_dir.name == 'europe_data' and setup.domain == 'in':
                for country_dir in os.scandir(domain_dir):
                    country_name = os.path.basename(country_dir).split('.')[1]
                    log('Counting function words in ' + country_name)
                    for user_dir in os.scandir(country_dir):
                        for file_dir in os.scandir(user_dir):
                            with open(file_dir, "r", encoding="utf-8") as file:
                                lines = file.readlines()
                            for line in lines:
                                words = line.split()
                                for word in words:
                                    if word in function_words_map:
                                        if word not in corpus_functionwords:
                                            corpus_functionwords[word] = 1
                                        else:
                                            corpus_functionwords[word] += 1
        top_corpus_functionwords = heapq.nlargest(numOfFunctionwords, corpus_functionwords, key=corpus_functionwords.get)
        util.save_file(synchronized_functionwords_file_path, top_corpus_functionwords)
    top_corpus_functionwords = util.load_file(synchronized_functionwords_file_path)
    return top_corpus_functionwords
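# Usage sketch (not part of the module): assuming setup.database, function_words_map and the
# util/log helpers are initialized elsewhere in this repo, the provider can be called directly
# to get (and cache) the N most frequent function words in the corpus:
#
#     top_fw = provide_corpus_functionwords(300)   # 300 is an illustrative value
#     print(len(top_fw))
#
# The first call builds and saves synchronized_functionwords.txt; later calls just load it
# (the on-disk format depends on util.save_file / util.load_file).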
def generate_top_bipos(save_path):
    log('Generating top bipos')
    all_bipos = {}
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            for country_dir in os.scandir(domain_dir):
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Generating top bipos for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            # parse the serialized (token, POS) pairs within the chunk text
                            pos_tokens = re.split(r"'\), \('|'\), \(\"", line)
                            for i in range(len(pos_tokens) - 2):
                                bigram = re.split(r"', '|\", '", pos_tokens[i])[1] + " "
                                bigram = bigram + re.split(r"', '|\", '", pos_tokens[i + 1])[1]
                                if bigram not in all_bipos:
                                    all_bipos[bigram] = 1
                                else:
                                    all_bipos[bigram] += 1
    top_bipos = heapq.nlargest(300, all_bipos, key=all_bipos.get)  # fetch top 300 bipos
    util.save_file(save_path, top_bipos)
def generate_top_trichars(save_path):
    log('Generating top trichars')
    all_trichars = {}
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            for country_dir in os.scandir(domain_dir):
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Generating top trichars for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            # parse lines within chunk text: step through the line in fixed
                            # 11-character records, reading the characters at offsets 1, 4 and 7
                            if len(line) >= 11:
                                cur_char = 0
                                while cur_char + 7 < len(line):  # guard against running past the end of the line
                                    trichar = line[cur_char + 1] + line[cur_char + 4] + line[cur_char + 7]
                                    if trichar not in all_trichars:
                                        all_trichars[trichar] = 1
                                    else:
                                        all_trichars[trichar] += 1
                                    cur_char += 11
    top_trichars = heapq.nlargest(1000, all_trichars, key=all_trichars.get)  # fetch top 1000 trichars
    util.save_file(save_path, top_trichars)
def generate_top_unigrams(save_path):
    log('Generating top unigrams')
    all_unigrams = {}
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            for country_dir in os.scandir(domain_dir):
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Generating top unigrams for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            unigrams = line.split()
                            for token in unigrams:
                                if token not in all_unigrams:
                                    all_unigrams[token] = 1
                                else:
                                    all_unigrams[token] += 1
    top_unigrams = heapq.nlargest(1000, all_unigrams, key=all_unigrams.get)  # fetch top 1000 unigrams
    util.save_file(save_path, top_unigrams)
def provide_top_spelling_errors():
    spelling_file_path = Path("vectors_handling/vectors/spelling_errors/top_spelling_errors.txt")
    if not util.exists(spelling_file_path):  # file not found on disk, so build it
        log('Cannot find top spelling errors file')
        generate_top_spelling_errors(spelling_file_path)
    top_spelling_errors = util.load_file(spelling_file_path)
    return top_spelling_errors
def provide_top_unigram():
    unigram_file_path = Path("vectors_handling/vectors/unigrams/top_unigrams.txt")
    if not util.exists(unigram_file_path):  # file not found on disk, so build it
        log('Cannot find top unigrams file')
        generate_top_unigrams(unigram_file_path)
    top_unigrams = util.load_file(unigram_file_path)
    return top_unigrams
def provide_top_tripos():
    tripos_file_path = Path("vectors_handling/vectors/pos/top_tripos.txt")
    if not util.exists(tripos_file_path):  # file not found on disk, so build it
        log('Cannot find top tripos file')
        generate_top_tripos(tripos_file_path)
    top_tripos = util.load_file(tripos_file_path)
    return top_tripos
def process_dir(domain_dir):
    users = []
    for country_dir in os.scandir(domain_dir):
        country_name = os.path.basename(country_dir).split('.')[1]
        log('Generating users for ' + country_name)
        for user_dir in os.scandir(country_dir):
            users.append(process_user(user_dir))
    return users
def generate(saving_path):
    log('Generating <' + setup.feature + ',' + setup.domain + '> user vectors')
    users = []  # avoid a NameError if neither domain directory matches the configured domain
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            users = process_dir(domain_dir)
        elif domain_dir.name == 'non_europe_data' and setup.domain == 'out':
            users = process_dir(domain_dir)
    util.save_file(saving_path, users)
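# Pipeline sketch (hedged, not part of the module): generate() walks setup.database and writes
# per-user feature vectors via process_dir() / process_user(). Assuming setup has been configured
# elsewhere in the repo, a hypothetical driver might look like:
#
#     setup.feature = 'functionwords'   # illustrative values; the real options live in setup
#     setup.domain = 'in'
#     generate(Path('vectors_handling/vectors/users.txt'))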
def generate_top_spelling_errors(save_path):
    log('Generating top spelling errors')
    all_spelling_errors = {}
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            for country_dir in os.scandir(domain_dir):
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Generating top spelling errors for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    errors = []
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            # parse lines within chunk text
                            json_data = json.loads(line)
                            for json_token in json_data:
                                if 'deletions' in json_token:
                                    if '[]' not in str(json_token['deletions']):  # not empty
                                        for component in json_token['deletions']:
                                            errors.append("del: " + component)
                                if 'insertions' in json_token:
                                    if '[]' not in str(json_token['insertions']):  # not empty
                                        for component in json_token['insertions']:
                                            errors.append("ins: " + component)
                                if 'replacements' in json_token:
                                    if '[]' not in str(json_token['replacements']):  # not empty
                                        for component in json_token['replacements']:
                                            errors.append("rep: " + str(component))
                    for error in errors:
                        if error not in all_spelling_errors:
                            all_spelling_errors[error] = 1
                        else:
                            all_spelling_errors[error] += 1
    top_spelling_errors = heapq.nlargest(400, all_spelling_errors, key=all_spelling_errors.get)  # fetch top 400 spelling errors
    util.save_file(save_path, top_spelling_errors)
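# Input format sketch (inferred from the keys read above, not from a documented schema): each
# line of a spelling-error chunk file is expected to be a JSON array of token objects, each
# possibly carrying 'deletions', 'insertions' and 'replacements' lists, for example:
#
#     [{"deletions": [], "insertions": [], "replacements": [["because"]]}]
#
# Non-empty lists contribute "del: ...", "ins: ..." and "rep: ..." entries to the tally.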
def classify(users, countries, train_users=None, train_countries=None):
    log('Starting classification process')
    if setup.type == 'binary':
        clf = LogisticRegression(solver='saga', max_iter=setup.iterations, n_jobs=setup.threads,
                                 class_weight='balanced')
    elif setup.type in ['family', 'language']:
        clf = LogisticRegression(solver='lbfgs', max_iter=setup.iterations, multi_class='ovr',
                                 n_jobs=setup.threads, class_weight='balanced')
    if setup.domain == 'in':
        log('Starting 10-fold cross validation process')
        classifier_scores = cross_val_score(clf, users, countries, cv=10)
        score = np.average(classifier_scores)
    elif setup.domain == 'out':
        log('Starting fit&score process')
        clf_trained = clf.fit(train_users, train_countries)
        score = clf_trained.score(users, countries)
    return score
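# Usage sketch (hedged): with feature vectors and country labels already built, the in-domain
# path runs 10-fold cross-validation, while the out-of-domain path trains on one set and scores
# another. Hypothetical calls, assuming the vectors and labels are parallel lists or arrays:
#
#     in_domain_acc = classify(users, countries)                                    # setup.domain == 'in'
#     out_domain_acc = classify(test_users, test_countries,
#                               train_users=train_users,
#                               train_countries=train_countries)                    # setup.domain == 'out'
#     log('Accuracy: ' + str(out_domain_acc))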