Example No. 1
    def get_tfidf(self, filename, file_tfidf):
        # Load the vocabulary, one "word:idf" pair per line.
        with open(r'D:\DS_lab\Project\PreProcessing\data\vocab.txt') as file:
            word_idfs = [(line.split(":")[0], float(line.split(":")[1]))
                         for line in line_tokenize(file.read())]
        idfs = dict(word_idfs)
        IDwords = dict(
            (word, index) for index, (word, idf) in enumerate(word_idfs))
        data = []
        with open(filename) as file:
            # Each input line has the form "label_____text".
            documents = [(line.split("_____")[0], line.split("_____")[1])
                         for line in line_tokenize(file.read())]
            for document in documents:
                # Keep only the words that appear in the vocabulary.
                words = [w for w in document[1].split() if w in idfs]
                set_of_words = list(set(words))
                sum_words = len(words)
                word_tfidfs = []
                sum_squares = 0
                for word in set_of_words:
                    # tf-idf = idf * (term count / document length).
                    tfidf = idfs[word] * words.count(word) / sum_words
                    sum_squares += tfidf**2
                    word_tfidfs.append((IDwords[word], tfidf))
                # L2-normalize and serialize as sparse "index:value" pairs.
                word_tfidfs_normalize = [
                    str(index) + ":" + str(tfidf / np.sqrt(sum_squares))
                    for index, tfidf in word_tfidfs
                ]
                sparse_data = " ".join(word_tfidfs_normalize)
                data.append("_____".join([document[0], sparse_data]))
        with open(file_tfidf, 'w') as file:
            file.write("\n".join(data))
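The lines written by get_tfidf have the form "label_____index:weight index:weight ...". A small sketch (not part of the original code) of how such a line can be parsed back:

def parse_tfidf_line(line):
    # Split the label from the sparse vector part.
    label, sparse = line.split("_____")
    vector = {}
    for pair in sparse.split():
        index, weight = pair.split(":")
        vector[int(index)] = float(weight)
    return label, vector

# Example with a made-up line in the format produced above.
label, vector = parse_tfidf_line("sci.space_____12:0.41 87:0.91")
print(label, vector)  # sci.space {12: 0.41, 87: 0.91}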
Example No. 2
def get_text(in_file):
    """Return the text contained in in_file, all in lowercase."""
    with open(in_file, "r", encoding="utf-8") as l_file:
        raw = l_file.read().lower()
    text = '\n'.join(nltk.line_tokenize(raw))

    return text
Example No. 3
def date_change(str_):
    date_time = ""
    from translate import Translator
    translator = Translator(from_lang='burmese', to_lang="en")

    import langid

    def lang_identifier_mm(text):
        # True when langid does not classify the line as English.
        return langid.classify(text)[0] != "en"

    import nltk, datetime
    current_month = datetime.datetime.now().date().month
    from dateutil import parser
    keywords = ["naypyidaw", "naypyitaw", "nay", "daw", "pyi", "taw"]
    for _line in nltk.line_tokenize(str_):
        if lang_identifier_mm(_line):
            # Translate non-English lines to English before parsing the date.
            _line = translator.translate(_line)
            _line = _line.lower()
            for _key in keywords:
                if _key in nltk.word_tokenize(_line):
                    date_time = "2019 " + _line.replace(_key, "").replace(" on ", "")
                    date_month = parser.parse(date_time).month
                    # If the month has already passed this year, assume the previous year.
                    if date_month < current_month:
                        date_time = "2018 " + _line.replace(_key, "").replace(" on ", "")
    return date_time
Example No. 4
def kmeans_clustering(source_path, files_list, clusters):
    """
    Function to perform kmeans clustering on the dataset.

    Args:
        source_path -- string. Path where the data files are located.
        files_list -- list of file names in the source path
        clusters -- number of clusters
    """
    stopwords = nltk.line_tokenize(open('stopwords.txt').read())
    docs = []
    filename = {}
    i = 0

    # To avoid np array issues due to large datasets, perform clustering by splitting the data into smaller subsets
    for file_name in files_list[:2000]:
        resume = open(source_path + '/' + file_name).read()
        docs.append(resume)
        filename[i] = file_name
        i += 1

    vectorizer = TfidfVectorizer(min_df=1)
    tfidf = vectorizer.fit_transform(docs)
    km = KMeans(n_clusters=clusters, init='k-means++', max_iter=10, n_init=1)
    km.fit(tfidf)
    results = []

    # Create a results list with filename and assigned cluster for each resume.
    # km.labels_ already holds the cluster of every fitted document, so there is
    # no need to densify the matrix or call predict() row by row (which would
    # also require a 2-D input).
    for i in range(tfidf.shape[0]):
        results.append([str(filename[i]), int(km.labels_[i])])

    # Copy each resume file from its source directory into a new directory named after its predicted cluster
    for cluster in range(clusters):
        # Create a combined resume text for each cluster for further analysis using word clouds
        word_cloud_text = ""
        for i in range(len(results)):
            if results[i][1] == cluster:
                document = open(source_path + '/' + results[i][0], 'r').read()
                docu = re.sub('[^A-Za-z\' ]+', '', str(document).lower())
                unigrams = docu.split()
                word_list = [
                    word.lower() for word in unigrams
                    if word.lower() not in stopwords
                ]
                text = " ".join(word_list)
                word_cloud_text += text
                destination = '/Users/' + user_name + '/Documents/Data/kmeans/pass1/' + str(
                    cluster)
                shutil.copy2(source_path + '/' + results[i][0], destination)

        f = open(
            '/Users/' + user_name + '/Documents/Data/kmeans/wordcloud/pass1/' +
            str(cluster) + ".txt", 'w')
        f.write(word_cloud_text)
        f.close()
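A side note on the shortcut used above: after fit(), km.labels_ already holds one cluster id per fitted document, so densifying the matrix and calling predict() row by row is unnecessary. A tiny self-contained sketch with made-up documents:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

docs = ["python developer resume", "java engineer resume", "python data scientist"]
tfidf = TfidfVectorizer(min_df=1).fit_transform(docs)
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(tfidf)

# One cluster id per input document, in the same order as docs.
print(list(km.labels_))
# predict() on a single document still works, but it needs a 2-D input,
# e.g. the sparse row tfidf[0] rather than tfidf.toarray()[0].
print(km.predict(tfidf[0]))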
Example No. 5
    def generate_vocabulary(self):
        self.min_df = 10

        def compute_idf(df, corpus_size):
            assert df > 0
            return np.log(corpus_size / df)

        with open(self.file) as file:
            data = file.read()
            lines = line_tokenize(data)
            corpus_size = len(lines)
            doc_count = defaultdict(int)
            for line in lines:
                components = line.split("_____")
                text = components[-1]
                features = list(set(text.split()))
                for w in features:
                    doc_count[w] += 1
            # words = list(doc_count.keys())
            # idfs = []
            # for word in words:
            #     if doc_count[word] > self.min_df:
            #         idf = compute_idf(doc_count[word], corpus_size)
            #         idfs.append(idf)
            #     else:
            #         words.remove(word)
            # vocab = zip(words, idfs)
            vocab = [(word, compute_idf(doc_count[word], corpus_size))
                     for word in list(doc_count.keys())
                     if doc_count[word] > self.min_df]
            feature_idfs = []
            for (feature, idf) in vocab:
                feature_idfs.append(feature + ":" + str(idf))
            with open('data\\vocab.txt', 'w') as file:
                file.write("\n".join(feature_idfs))
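The idf used here is the plain logarithm of corpus size over document frequency; a quick numeric check (values chosen only for illustration):

import numpy as np

def compute_idf(df, corpus_size):
    assert df > 0
    return np.log(corpus_size / df)

# A term appearing in 50 of 5000 documents gets idf = ln(100) ≈ 4.605.
print(round(float(compute_idf(50, 5000)), 3))  # 4.605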
Example No. 6
def populate(keyfile, textfile):
    #imatrix=copy.deepcopy(amatrix)
    #itext=text.tokens[:]
    print("populate : ", keyfile)
    f = open('Keys/' + keyfile)
    raw = f.read().lower()
    #tokens1 = nltk.word_tokenize(raw)
    tokens1 = nltk.line_tokenize(raw)
    text11 = removelinebreak(tokens1)
    tokens1 = text11
    text1 = Text(raw)
    text1.tokens = tokens1
    #print(tokens1)
    #text.updatetokens(tokens1)
    #text.tokens.append(textfile)
    #print(text.tokens)
    if len(text1.tokens) > 0:
        poslist = text1.update_graph('Corpus/' + textfile)
        text.updatetokens(text1.tokens)
        #print ("after: ",text.tokens)
        amatrix.updateMatrix(text.tokens, poslist, False)
        text.tokens.sort()
    print(amatrix)
    rem = []
    for i in text.tokens:
        if i[len(i) - 1] == "s" and i[:-1] in text.tokens:
            t = i[:-1]
            for j in text.tokens:
                if amatrix.gmatrix[j][i].weight != float('inf'):
                    if amatrix.gmatrix[j][t].weight != float('inf'):
                        amatrix.gmatrix[j][t].weight += amatrix.gmatrix[j][i].weight
                        amatrix.gmatrix[j][t].numupdate += amatrix.gmatrix[j][i].numupdate
                    else:
                        amatrix.gmatrix[j][t].weight = amatrix.gmatrix[j][i].weight
                        amatrix.gmatrix[j][t].numupdate = amatrix.gmatrix[j][i].numupdate
                    # Keep the matrix symmetric between (j, t) and (t, j).
                    amatrix.gmatrix[t][j].weight = amatrix.gmatrix[j][t].weight
                    amatrix.gmatrix[t][j].numupdate = amatrix.gmatrix[j][t].numupdate
            print("pop remove: ", i)
            rem.append(i)
    for i in rem:
        for j in text.tokens:
            del amatrix.gmatrix[j][i]
    for i in rem:
        print(i, "hi")
        del amatrix.gmatrix[i]
        text.tokens.remove(i)
Example No. 7
def get_cosine(block):
    lines = nltk.line_tokenize(block)

    prefix = 'cosine similarity between vectors: '
    # str.strip() removes a set of characters, not a prefix, so slice the
    # prefix off instead of calling line.strip(prefix).
    values = [
        line[len(prefix):] for line in lines
        if line.startswith('cosine')
    ]
    if len(values) == 0:
        return 0
    else:
        return values[0]
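A quick illustration (not from the original source) of why the character-set strip() was risky here:

line = 'cosine similarity between vectors: nan'
prefix = 'cosine similarity between vectors: '

# strip() drops any leading/trailing characters that appear in its argument,
# so 'nan' is wiped out entirely because 'n' and 'a' occur in the prefix.
print(repr(line.strip(prefix)))   # ''
# Slicing (or str.removeprefix on Python 3.9+) keeps the value intact.
print(repr(line[len(prefix):]))   # 'nan'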
Example No. 9
def extract_from_file(tagger, files, labels, file_id, keep):
    extracted_lines = []
    extracted_labels = []
    extract_from = []

    file = files[file_id]
    if labels:
        label = labels[file_id]
        label = list(label)

    i = 0
    lines = nltk.line_tokenize(file)

    start = None
    count = 0

    found_list = [contains_keywords.remote(line, tagger) for line in lines]
    found_list = ray.get(found_list)
    while i < len(lines):
        line = lines[i]
        if found_list[i]:
            count = 0
            # start may legitimately be 0, so test against None rather than truthiness.
            if start is None:
                start = i
        else:
            count += 1

        if start is not None:
            if count == keep or i == len(lines) - 1:
                start = max(start - keep, 0)
                end = min(i, len(lines) - 1)
                #                         print(end)
                new_extracted = '\n'.join(lines[start:end + 1])
                extracted_lines.append(new_extracted)
                if labels:
                    extracted_labels.append(label[start:end + 1])
                extract_from.append((file_id, start))
                start = None
                count = 0
        i += 1
    return extracted_lines, extracted_labels, extract_from
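extract_from_file assumes a Ray remote function contains_keywords defined elsewhere in the project. A minimal sketch of what such a function could look like; the keyword set is an assumption for illustration, and the tagger argument is only accepted to mirror the call site:

import ray

ray.init(ignore_reinit_error=True)

# Hypothetical stub -- the real contains_keywords lives elsewhere in the project.
@ray.remote
def contains_keywords(line, tagger):
    keywords = {"experience", "education", "skills"}
    tokens = {tok.lower() for tok in line.split()}
    return bool(keywords & tokens)

# Usage mirrors the call in extract_from_file:
# futures = [contains_keywords.remote(line, tagger) for line in lines]
# flags = ray.get(futures)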
Example No. 10
def initgraph(keyfile, textfile):
    f = open(keyfile)
    raw = f.read().lower()
    global text
    text = Text(raw)
    #tokens = nltk.word_tokenize(raw)
    tokens = nltk.line_tokenize(raw)
    text.tokens = list(set(tokens))
    text1 = removelinebreak(text.tokens)
    #print text1
    text1.sort()
    #text1.append(textfile)
    text.tokens = text1
    #print text.tokens
    #print len(text.tokens)
    global amatrix
    #print ("before: ",text.tokens)
    amatrix = text.create_graph(textfile)
    #print("init:",len(text.tokens))
    #amatrix.draw_ind_png(text.tokens,text.tokens,textfile)
    #print len(amatrix.gmatrix)
    return text.tokens
Example No. 11
from __future__ import division
import os
import re
import nltk
import random
import string
import pickle
from lxml import etree
from nltk import bigrams
from nltk import FreqDist
from collections import Counter
from util import ResumeCorpus

user_name = os.environ.get('USER')
punct = string.punctuation
stopwords = nltk.line_tokenize(open('stopwords.txt').read())
porter = nltk.PorterStemmer()


def create_skills_json(training_data):
    """
    This function will extract all the skills from the training corpus and create a dictionary with Job Titles as
    keys and list of all the skills for that Job Title as values

    Args:
        training_data -- list of tuples. Eg. [(resume, tag, filename), (resume, tag, filename)...]

    Returns:
        skills_dict -- A dictionary with Job Titles as keys and list of all the skills for that Job Title as values
    """
Example No. 12
def get_line_breaks(text):
    # Keep blank lines so that line positions stay aligned with the original text.
    lines = nltk.line_tokenize(text, blanklines='keep')
    # Token count of every line except the last; `tokenizer` is a word-level
    # tokenizer defined elsewhere in the module.
    snippets = [len(tokenizer.tokenize(line)) for line in lines[:-1]]
    # Cumulative token offsets, shifted by one per preceding line break.
    breaks = np.array(snippets).cumsum() + np.arange(len(snippets))
    return breaks
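get_line_breaks relies on module-level tokenizer and np objects. A small usage sketch, assuming tokenizer is an NLTK word tokenizer (the original module presumably defines its own):

import nltk
import numpy as np
from nltk.tokenize import TreebankWordTokenizer

# Assumed stand-in for the module-level tokenizer used by get_line_breaks.
tokenizer = TreebankWordTokenizer()

text = "First line of the document.\nA second, longer line follows.\nLast line."
breaks = get_line_breaks(text)
# One cumulative token offset per line break, marking where each original
# line boundary falls in the flattened token stream.
print(breaks)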
Example No. 13
import nltk
import os
import json
import pickle

# assign input path and file name
inpath = 'org_texts/'
infile = inpath + 'tasis_faq.txt'
# assign empty dict
d = {}
# open txt file
with open(infile) as f:
    # read file
    trial = f.read()
    # tokenize by line
    tokenize = nltk.line_tokenize(trial)
    # Even-indexed lines (questions) become keys; the following odd-indexed lines (answers) become values.
    # Stop one short of the end so an unpaired trailing line does not raise IndexError.
    for i in range(0, len(tokenize) - 1, 2):
        d[tokenize[i]] = tokenize[i + 1]

# assign output path and filename
outpath = 'dict_texts/'
outname = outpath + 'tasis_faq_dict'
# write outfile
outfile = open(outname, 'wb')
pickle.dump(d, outfile)
outfile.close()
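To read the pickled FAQ dictionary back, following the paths used above (a usage sketch):

import pickle

with open('dict_texts/tasis_faq_dict', 'rb') as infile:
    faq = pickle.load(infile)

# Each question line maps to the answer line that follows it.
for question, answer in list(faq.items())[:3]:
    print(question, '->', answer)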
Example No. 14
'''
Returns lists of tokens, tags, and items, then
writes them to text files.
'''

import nltk

# Load the text file and read it as a single string
with open("tpb_sopa.txt") as data:
    raw = data.read()

# Split the text into lines
lines = nltk.line_tokenize(raw)
# Write the list of lines to disk
FILE = open("out/lines.txt", "w")
for line in lines:
    FILE.writelines(str(line) + '\n')
FILE.close()
print("lines: " + str(len(lines)))


# Split the text into sentences
sentences = nltk.sent_tokenize(raw)
# Write the list of sentences to disk
FILE = open("out/sentences.txt", "w")
for sentence in sentences:
    FILE.writelines(str(sentence))
FILE.close()
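The docstring also mentions tokens and tags, which the excerpt above never produces; a plausible continuation under that assumption (the out/tokens.txt and out/tags.txt file names are hypothetical):

# Hypothetical continuation -- not part of the original excerpt.
# Split the text into word tokens
tokens = nltk.word_tokenize(raw)
FILE = open("out/tokens.txt", "w")
for token in tokens:
    FILE.writelines(str(token) + '\n')
FILE.close()
print("tokens: " + str(len(tokens)))

# Tag the tokens with parts of speech
tags = nltk.pos_tag(tokens)
FILE = open("out/tags.txt", "w")
for token, tag in tags:
    FILE.writelines(token + '\t' + tag + '\n')
FILE.close()
print("tags: " + str(len(tags)))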