def get_test_feature(text_file, standardize_file, output_file):
    with open(standardize_file, 'r') as f:
        param = json.load(f)
    text = load_text(text_file)

    # Test data carries no gold labels, so every line gets a dummy 0.
    labels = [0] * len(text)
    feat = [({'name': 'label', 'type': '{0, 1}'}, labels)]
    '''
        Call functions to extract features and add to data.
    '''
    feat += avg_sen_len(text)
    feat += w2v_sim(text)
    feat += ngram(text, './models/3.binlm', 'tri')
    feat += ngram(text, './models/4.binlm', 'quad')

    # text = load_raw_text(text_file)
    # pos_feat = extract_pos_feat(text)
    # with open('temp_pos', 'w') as f:
    #     for line in pos_feat:
    #         f.write(line)
    # pos_labels = load_text('temp_pos')
    # feat += ngram(pos_labels, './models/pos3.binlm', 'pos-tri')
    # feat += ngram(pos_labels, './models/pos4.binlm', 'pos-quad')

    '''
        Output the libsvm file.
    '''
    arff_dump('temp_feat.arff', feat, param=param)
    data, label = load_arff('temp_feat.arff')
    to_libsvm(data, label, output_file)
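Each extractor above returns a list of (attribute, values) pairs that arff_dump can serialize alongside the label column. As a hedged sketch only (the real avg_sen_len is not shown in this listing), a feature in that shape might be computed like this:

def avg_sen_len(text):
    # Sketch: average words per sentence for each input line, returned in
    # the (attribute, values) shape used above. Not the original code.
    values = []
    for doc in text:
        sentences = [s for s in doc.split('.') if s.strip()]
        words = doc.split()
        values.append(float(len(words)) / max(len(sentences), 1))
    return [({'name': 'avg_sen_len', 'type': 'numeric'}, values)]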
Example #3
def run(train, nmax, reps, out):
	# Get probabilities with arbitrary precision (exact fractions)
	with open(train) as fh:
		ngram.set_fractions(True)
		probs_ap = ngram.probabilities(
			ngram.good_turing(ngram.ngram(nmax, filters.unk(filters.shakespeare(fh)))))

	# Get probabilities with logs
	with open(train) as fh:
		ngram.set_fractions(False)
		probs_log = ngram.probabilities(
			ngram.good_turing(ngram.ngram(nmax, filters.unk(filters.shakespeare(fh)))))

	# Make sentences
	sentence_generation(train, out, nmax, reps, probs_ap, probs_log)
Example #4
def ngram_plot(values, top=10):
    """
    Plot an ngram

    :param values: Is either a string or an array containing strings
    :param top: Is the number of results you want returned (optional, defaults to 10)
    :return: A matplotlib plot of the n-gram frequencies

    e.g.

       ngram_plot(A1:10)
    """
    counts = ngram(values)

    # counts comes back sorted by frequency, so slicing keeps the top results
    words = []
    freq = []
    for key, val in counts[:top]:
        words.append(key)
        freq.append(val)

    # Draw the bar chart once, after all frequencies are collected
    y_pos = np.arange(len(words))
    plt.bar(y_pos, freq, align='center')
    plt.xticks(y_pos, words)
    plt.ylabel('Frequency')
    plt.title('Common Ngrams')
    return plt
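A hedged usage sketch, assuming ngram (not shown in this listing) returns (word, count) pairs sorted by descending frequency:

chart = ngram_plot(["the cat sat on the mat", "the dog sat on the log"], top=5)
chart.show()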
Example #5
def add_model(self, model_name):
    # Initialize per-model accumulators for BLEU, diversity, and timing stats.
    self.model_results[model_name] = {
        # "batch count": 0,
        "count": 0,
        "BLEU_count": 0,
        "BLEU_count_arr": [],
        "BLEU": 0,
        "BLEU_arr": [],
        "context_len_arr": [],
        "context BLEU": 0,
        "gen_list_temp": [],
        "self BLEU count": 0,
        "self BLEU": 0,
        "self BLEU arr": [],
        "self BLEU count arr": [],
        "perplexity": 0,
        "ngram": ngram(),
        "ngram count": 0,
        "ngram count arr": [],
        "unigram": 0,
        "bigram": 0,
        "unigram arr": [],
        "bigram arr": [],
        "time_sum": 0,
        "time_count": 0,
    }
Example #6
def checkPlagiarism(file):
    isPlagiarized = False
    grams = fileOpen(file)
    # Split the document into 9-grams and search each one on Google
    ngrams = ngram(grams, 9)
    ngrams = [' '.join(i) for i in ngrams]

    for i in range(len(ngrams)):
        driver = webdriver.Firefox()
        toSearch = ngrams[i].encode('utf-8')
        driver.get("http://google.com")
        search = driver.find_element_by_name('q')
        search.send_keys(ngrams[i])
        search.send_keys(Keys.RETURN)

        googleResult = googleSearch(toSearch)

        # Flag the file as soon as any result is sufficiently similar
        for result in googleResult:
            similarity = getSimilarity(toSearch,
                                       strip_tags(result.description))
            if similarity >= 70:
                print("This file was plagiarized!")
                isPlagiarized = True
                driver.quit()
                return isPlagiarized

        driver.quit()

    print("This file is original with no evidence of plagiarism.")
    return isPlagiarized
Example #7
def add_model(self, model_name):
    # Initialize per-model accumulators for BLEU, hit-rate, and timing stats.
    self.model_results[model_name] = {
        "batch count": 0,
        "count": 0,
        "BLEU_count": 0,
        "BLEU_count_arr": [],
        "BLEU": 0,
        "BLEU_arr": [],
        "context_len_arr": [],
        "context BLEU": 0,
        "gen_list_temp": [],
        "self BLEU count": 0,
        "self BLEU": 0,
        "token hit": 0,
        "word type hit": 0,
        "topic hit": 0,
        "exact token hit": 0,
        "exact word type hit": 0,
        "exact topic hit": 0,
        "perplexity": 0,
        "ngram": ngram(),
        "unigram": 0,
        "bigram": 0,
        "time_sum": 0,
        "time_count": 0,
    }
Example #8
def renew_ngram(self):
    for model_name, model in self.model_results.items():
        model["batch count"] += 1
        # Fold this batch's distinct-n-gram ratios into the running totals,
        # then reset the counter for the next batch.
        unigram, bigram = model["ngram"].diversity_n()
        model["unigram"] += unigram
        model["bigram"] += bigram
        model["ngram"] = ngram()
Example #9
File: 05.py Project: tkyf/nlp100
def main():
    import ngram

    sentence = 'I am an NLPer'

    word_2gram = ngram.word_ngram(sentence, 2)
    char_2gram = ngram.ngram(sentence, 2)

    print(word_2gram)
    print(char_2gram)
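A minimal sketch of the two helpers this exercise assumes, with ngram sliding over characters and word_ngram over whitespace tokens; the signatures are inferred from the call sites above, not taken from the original module:

def ngram(seq, n):
    # every contiguous window of length n (character n-grams for a string)
    return [seq[i:i + n] for i in range(len(seq) - n + 1)]

def word_ngram(sentence, n):
    # word n-grams over whitespace-separated tokens
    words = sentence.split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]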
Example #10
def renew_ngram(self, head_idx):
    for model_name, model in self.model_results.items():
        model["ngram count"] += 1
        model["ngram count arr"][head_idx] += 1
        # Fold this round's ratios into both the running totals and the
        # per-head-index buckets, then reset the counter.
        unigram, bigram = model["ngram"].diversity_n()
        model["unigram"] += unigram
        model["bigram"] += bigram
        model["unigram arr"][head_idx] += unigram
        model["bigram arr"][head_idx] += bigram
        model["ngram"] = ngram()
Example #11
def getwords(doc):
    # Category keywords (in Chinese): pre-pregnancy, pregnancy, postpartum,
    # fever, prenatal education, mother/fetus, disease, nutrition, care,
    # disease, pregnant woman, expectant mother, childbirth
    menus = [
        "备孕", "怀孕", "产后",
        "发烧", "胎教", "母胎",
        "疾病", "营养", "护理",
        "疾病", "孕妇", "孕妈",
        "生育",
    ]
    tg = ngram(menus, min_sim=0.0)
    words = tg.getSimilarStrings(doc.encode("utf8")).keys()
    return {w: 1 for w in words}
Example #12
File: 06.py Project: tkyf/nlp100
def main():
    import ngram
    sentence1 = 'paraparaparadise'
    sentence2 = 'paragraph'

    X = set(ngram.ngram(sentence1, 2))
    Y = set(ngram.ngram(sentence2, 2))

    XYunion = X | Y
    XYintersection = X & Y
    XYdifference = X - Y
    print(X)
    print(Y)
    print(XYunion)
    print(XYintersection)
    print(XYdifference)

    if 'se' in X and 'se' in Y:
        print("'se' is in X and Y")
    else:
        print("'se' is not in X and Y")
Example #13
def makengramtext(n, fname):
    fngram = fname[:-4] + '_{}-gram.txt'.format(n)

    inittext(fngram)
    with open(fname, mode='r', encoding='utf-8') as f, \
            open(fngram, mode='a', encoding='utf-8') as fn:
        for line in tqdm.tqdm(f):
            line = line.rstrip()
            # Replace spaces with the sentinel character 'R' before
            # building the character n-grams.
            line = re.sub(' ', 'R', line)
            ngram_list = ngram.ngram(line, n)
            fn.write(' '.join(ngram_list) + '\n')
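A hypothetical call (the input file name is an assumption, and inittext is presumed to create or truncate the output file): processing corpus.txt would write one line of space-joined character 3-grams per input line.

makengramtext(3, 'corpus.txt')   # writes corpus_3-gram.txt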
Example #14
def compareAuthors(authors, compareDict, tg_dict):

    # How many texts (of those compared against the corpus) each author wrote
    authorMade = {}

    # Which authors have had their texts attributed to whom
    resultDict = {}

    for value in compareDict:
        textToCompare = value["text"]
        realAuthor = value["user_id"]

        # Build an n-gram model for the text under comparison
        compareTo = [textToCompare]
        com = ngram.ngram(compareTo)

        (gram, workList) = com.total_ngram(compareTo)
        dataDist = {}

        # Score the text against every candidate author's model
        for author in authors:
            score = 0.0
            tg = tg_dict[author]
            for word in workList:
                score += tg.propability(word, 0)

            dataDist[author] = score

        # Pick the author with the highest score
        bestScore = -1
        for an in dataDist.keys():
            newValue = dataDist[an]
            if newValue > bestScore:
                bestScore = newValue
                author = an

        print("Real author:", realAuthor)
        print("Most likely author:", author)

        if author not in authorMade:
            authorMade[author] = [realAuthor]
        else:
            authorMade[author].append(realAuthor)

        # Keep score of the attributions
        if realAuthor not in resultDict:
            resultDict[realAuthor] = {author: 1}
        elif author not in resultDict[realAuthor]:
            resultDict[realAuthor][author] = 1
        else:
            resultDict[realAuthor][author] += 1
    return (authorMade, resultDict)
Example #15
def makeNgram(filename):
    worker = JSON.workOnJSON()
    entries = worker.read_JSON_file(filename)
    authorDict = {}
    authorWrittenDict = {}
    tg_dict = {}

    # Group posts by author
    for entry in entries:
        author = entry["user_id"]
        post_id = entry["post_id"]

        value = {"user_id": author, "text": entry["text"]}

        if author in authorDict:
            authorDict[author].append(value)
            authorWrittenDict[author].append(post_id)
        else:
            authorDict[author] = [value]
            authorWrittenDict[author] = [post_id]

    newAuthorDict = {}

    # Build one n-gram model per author from the concatenation of their texts
    for authorName in authorDict.keys():
        author = authorDict[authorName]

        listOfEntries = [entry["text"] for entry in author]
        newAuthorDict[authorName] = listOfEntries
        text = ''.join(listOfEntries)

        tg = ngram.ngram(listOfEntries)
        tg.corp = text
        tg.newRemember()
        tg_dict[authorName] = tg

    return (newAuthorDict, tg_dict)
Example #16
def __init__(self, corpus, n):
    # corpus: tagged training sentences in the format
    #         [[('Hello', 'NNP'), ('world', 'NN'), ('!', '.')], [...], ...]
    # n: the n of the n-gram language model

    # The POS-tagging task is defined as:
    # 1. transition: an n-gram model
    # 2. emission: P(word | pos)
    # 3. initial distribution: P('START') = 1.0

    # Preprocess the corpus: wrap every sentence in start/end markers
    brown_tags_words = []
    for sent in corpus:
        brown_tags_words.append(('START', 'START'))
        brown_tags_words.extend([(tag[:2], word) for word, tag in sent])
        brown_tags_words.append(('END', 'END'))

    # Estimate the emission probabilities from the corpus
    cfd_tagwords = ConditionalFreqDist(brown_tags_words)
    # P(W = word | condition = pos)
    cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)
    emission = {
        tag:
        {word: cpd_tagwords[tag].prob(word)
         for word in cfd_tagwords[tag]}
        for tag in cpd_tagwords
    }

    # Estimate the transition probabilities via the n-gram model
    tags = [[tag for _, tag in sent] for sent in corpus]
    transition = Transition(ngram(tags, n))

    # Initial distribution: every sequence starts at START with probability 1
    initial_distribution = {('START', ): 1.0}

    # Assemble the POS tagger
    HMM.__init__(self, initial_distribution, transition, emission, n)
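A hypothetical usage sketch; the excerpt does not show the name of the enclosing class, so POSTagger and the Brown corpus slice are assumptions:

from nltk.corpus import brown

# Train on a slice of the Brown corpus with bigram transitions (n=2)
tagger = POSTagger(brown.tagged_sents()[:5000], 2)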
Example #17
def nlist(arr, n=2):
    # Flatten the n-grams of every text in arr into a single list
    result = []
    for text in arr:
        result.extend(ngram(text, n))
    return result
Example #18
async def mimic(ctx, word, num_words=0):
    """Mimics Owen given a starting word."""
    sentence = ngram.ngram(counts, word, num_words)
    await ctx.send(sentence)
Example #19
def __init__(self, fileName):
	self.sList = readXml(fileName)
	self.n = ngram.ngram()
	self.usedDist = 1
Example #20
def differenceSet(a, b):
    # elements of a that are not shared with b
    product = productSet(a, b)
    result = []
    for w in a:
        if w not in product:
            result.append(w)

    return result

if __name__ == '__main__':
    from ngram import ngram
    sentence1 = "paraparaparadise"
    sentence2 = "paragraph"
    
    X = ngram(2, sentence=sentence1, mode='char')
    Y = ngram(2, sentence=sentence2, mode='char')
    
    union = unionSet(X, Y)
    product = productSet(X, Y)
    difference1 = differenceSet(X, Y)
    difference2 = differenceSet(Y, X)
    
    print("unionSet is {0}".format(union))
    print("productSet is {0}".format(product))
    print("differenceSet X-Y is {0}".format(difference1))
    print("differenceSet Y-X is {0}".format(difference2))

    print("'se' is in X : {0}".format(str('se' in X)))
    print("'se' is in Y : {0}".format(str('se' in Y)))
Example #21
import ngram
import re
import string
from preprocess import *
from stemming.porter2 import stem
import nltk
import numpy as np

if __name__ == "__main__":
    model = ngram.ngram(3)
    # Train the model on text file preprocessed
    with open("preprocessed/train_set.txt", 'r') as train_set:
        model.train(train_set.read())

    with open("Holmes.machine_format.questions.txt", 'r') as questions_machine, \
    open("preprocessed/questions.txt", 'r') as questions_set, \
    open("Holmes.machine_format.answers.txt", 'r') as ans_machine, \
    open("preprocessed/answers.txt", 'r') as answers_set:
        # prepare questions for preprocessing
        questions = questions_machine.read().split('\n')

        # format answers for easier comparison
        answers = ans_machine.read().split('\n')

        questions_set = questions_set.read().split('\n')
        answers_set = answers_set.read().split('\n')

        i = 0
        sentences = []
        words = ""
        rights = 0
Example #22
def getModels():
    tru_unigram = ngram.ngram('true.train', 1, ngram.Smooth.GOOD_TURING, True)
    fal_unigram = ngram.ngram('false.train', 1, ngram.Smooth.GOOD_TURING, True)
    
    tru_uni_rl = ngram.ngram('true.train', 1, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    fal_uni_rl = ngram.ngram('false.train', 1, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)

    tru_bigram = ngram.ngram('true.train', 2, ngram.Smooth.GOOD_TURING, True)       
    fal_bigram = ngram.ngram('false.train', 2, ngram.Smooth.GOOD_TURING, True)

    tru_bi_rl = ngram.ngram('true.train', 2, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    fal_bi_rl = ngram.ngram('false.train', 2, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)

    #trial code
    tru_trigram = ngram.ngram('true.train', 3, ngram.Smooth.GOOD_TURING, True)
    fal_trigram = ngram.ngram('false.train', 3, ngram.Smooth.GOOD_TURING, True)

    tru_tri_rl = ngram.ngram('true.train', 3, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    fal_tri_rl = ngram.ngram('false.train', 3, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)

    tru_quadgram = ngram.ngram('true.train', 4, ngram.Smooth.GOOD_TURING, True)
    fal_quadgram = ngram.ngram('false.train', 4, ngram.Smooth.GOOD_TURING, True)

    return [tru_unigram, fal_unigram, tru_bigram, fal_bigram,
            tru_trigram, fal_trigram, tru_quadgram, fal_quadgram,
            tru_uni_rl, fal_uni_rl, tru_bi_rl, fal_bi_rl,
            tru_tri_rl, fal_tri_rl]
Example #23
def train(self, tokens):
    # Record, for every bigram (s1, s2), that s2 followed s1
    bigram = ngram(tokens)

    for s1, s2 in bigram:
        self.chains.setdefault(s1, []).append(s2)
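A companion sketch showing how the chains built by train might be consumed; this generate method is an assumption, not part of the original class:

import random

def generate(self, start, length=10):
    # Random-walk the successor lists built by train()
    out = [start]
    for _ in range(length - 1):
        successors = self.chains.get(out[-1])
        if not successors:
            break
        out.append(random.choice(successors))
    return ' '.join(out)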
Example #24
import ngram

str1 = "paraparaparadise"
str2 = "paragraph"
tool = ngram.ngram()
list1 = tool.literaln(str1, 2)
list2 = tool.literaln(str2, 2)
Example #26
# Call rate labeling
import ngram
import pandas as pd

ng = ngram.ngram()
fl = ng.search(r'C:\Users\student\nlp_project\bok_project\ngram\ngram_data')

# Read the (date, ngram) columns from every discovered file,
# then concatenate them into one frame in a single pass.
frames = [ng.select_file(doc_num)[['date', 'ngram']] for doc_num in range(len(fl))]
ngram_df = pd.concat(frames)

ngram_df.reset_index()[['date', 'ngram']].to_json('total_ngram.json')
Example #27
def __ngram(x):
    # Dispatch on the captured settings: skip-bigrams, n-grams, or raw tokens
    if skipgram:
        return skip_bigram(x, max_step=skipgram)
    if n > 1:
        return ngram(x, n=n)
    return x
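Here skipgram, n, skip_bigram, and ngram are captured from an enclosing scope the excerpt omits. A rough sketch of a skip_bigram consistent with the max_step parameter, assuming ROUGE-S-style ordered pairs within a bounded gap:

def skip_bigram(tokens, max_step=2):
    # ordered token pairs whose gap is at most max_step (ROUGE-S style)
    return [(tokens[i], tokens[j])
            for i in range(len(tokens) - 1)
            for j in range(i + 1, min(i + 1 + max_step, len(tokens)))]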
Example #31
 def __init__(self):
     # Conversion of some Turkish-language-specific characters
     self.charList = {
         "\x80": "c",
         "\x8e": "a",
         "\x99": "o",
         "\xa3": "u",
         "\x81": "",
         "\x89": "",
         "\x93": "",
         "\x94": "",
         "\xa1": "i",
         "\xad": "",
         "\xb0": "",
         "\xb1": "",
         "\xb3": "",
         "\xba": "",
         "\xe3": "",
         "\xe4": "a",
         chr(0xe2): "o",  # 0xe2 karakterini silemiyorum!!! :S
         "é": "e",
         "ğ": "g",
         "ü": "u",
         "ş": "s",
         "ı": "i",
         "ö": "o",
         "ç": "c",
         "Ğ": "G",
         "Ü": "U",
         "Ş": "S",
         "İ": "I",
         "Ö": "O",
         "Ç": "C",
         "!": "",
         ",": "",
         ".": "",
         ";": "",
         ":": "",
         ")": "",
         "(": "",
         "'": "",
         '"': "",
         "1": "",
         "2": "",
         "3": "",
         "4": "",
         "5": "",
         "6": "",
         "7": "",
         "8": "",
         "9": "",
         "0": "",
         "-": "",
         "_": "",
         "?": "",
         "%": "",
         "$": "",
         "&": "",
         "/": "",
         "\\": ""
     }  #This set is not complete!!
     self.n = ngram.ngram()
     self.feedList = []
Example #34
    pos_file = './data/' + pos_file
    param = None
    if len(sys.argv) > 4:
        standardize_file = sys.argv[4]
        param = json.loads(open(standardize_file, 'r').read())

    text = load_text(text_file)
    labels = [int(line.strip()) for line in open(label_file, 'r')]
    data = [({'name': 'label', 'type': '{0, 1}'}, labels)]
    assert len(labels) == len(text)
    '''
        Call functions to extract features and add to data.
    '''
    data += avg_sen_len(text)
    data += w2v_sim(text)
    data += ngram(text, './models/3.binlm', 'tri')
    data += ngram(text, './models/4.binlm', 'quad')

    #print pos_file
    #pos_labels = load_text(pos_file)
    #data += ngram(pos_labels, './models/pos3.binlm', 'pos-tri')
    #data += ngram(pos_labels, './models/pos4.binlm', 'pos-quad')
    '''
        Output the arff file.
    '''
    param = arff_dump(output_file, data, param=param)

    if len(sys.argv) <= 4:
        # No standardization file was given, so save the computed parameters
        with open('param.json', 'w') as f:
            f.write(json.dumps(param, indent=2))
Example #35
import ngram
import list_subtraction

a="paraparaparadise"
b="paragraph"

x=sorted(set(ngram.ngram(a,2)))
y=sorted(set(ngram.ngram(b,2)))
print("x:",x)
print("y:",y)

w=x+y

""" 関数ngramの内容

def ngram(text,n):
    #n字数ごとずらす
    x=[]
    for y in range(len(text)):
        
        if y==len(text)-n+1:
           break;
        x.append(text[y:y+n])
    return x
"""
f=sorted(set(w))

#set型は重複しない要素(同じ値ではない要素、ユニークな要素)のコレクション
#しかし元の順序とは異なる
s=sorted(set(list_subtraction.list_subtraction(w,f)))
c=sorted(set(list_subtraction.list_subtraction(x,s)))
Example #36
def test_ngram_twogram(self):
    import ngram
    cnts = ngram.ngram(n=2, k=1, document='test.txt')
    self.assertIn("a a", cnts, 'ngram does not contain appropriate keys' + str(cnts))
    self.assertEqual(cnts["a a"], 2, 'ngram count is incorrect' + str(cnts))
    self.assertEqual(len(cnts), 1, 'ngram should only contain top k counts' + str(cnts))
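The test implies an API where ngram.ngram(n, k, document) returns only the k most frequent n-gram counts. A minimal sketch that would satisfy it, assuming whitespace tokenization (this is not the module under test):

from collections import Counter

def ngram(n, k, document):
    # Count whitespace-token n-grams and keep only the k most common
    with open(document) as f:
        tokens = f.read().split()
    grams = [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return dict(Counter(grams).most_common(k))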