# This snippet assumes the surrounding module provides a custom `stop` module
# (stop.stop3 filters terms by POS tag) and globals such as person, dataType,
# dataSegFileName, fileName, vocabFileName, termFreqLowBound, loadJson, writeJson.
import json
import word2vec

cleanFileName = './%s/%s_%sData_clean.txt' % (person, person, dataType)

f = open(dataSegFileName, 'r')
l = open(cleanFileName, 'a')              # append: the clean file may already hold earlier output
count = 0
while True:
    line = f.readline()
    count += 1
    if line == '':
        break
    if count <= 194833:                   # resume: skip lines already written in a previous run
        continue
    line = line.split()[1:]               # drop the leading document ID, keep the term/POS tokens
    for term in line:
        try:
            if stop.stop3(term.split('/')[1]):   # skip stop terms by their POS tag
                continue
        except IndexError:
            print(count)
            print('ERROR')
            print(term)
            #quit()
        term = term.split('/')[0]          # keep only the surface form
        l.write(term + ' ')
    l.write('\n')
f.close()
l.close()
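# The loop above implies each line of dataSegFileName is a document record of the
# form "<doc-id> <term>/<POS> <term>/<POS> ..." (a hypothetical illustration of the
# layout suggested by the split() calls, not an excerpt from the real data).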

quit()   # early exit: the training calls below only run if this quit() is removed

# Train doc2vec vectors on the cleaned corpus, then (optionally) build phrases.
word2vec.doc2vec(cleanFileName, fileName, cbow=0, size=50, window=10, negative=5,
                 hs=0, sample='1e-4', threads=12, iter_=20, min_count=1, verbose=True)
word2vec.word2phrase(cleanFileName, fileName, verbose=True)
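# A minimal sketch of loading and querying the trained vectors, assuming the
# danielfrg `word2vec` Python package is in use and that `fileName` points to the
# vector file it produced; the query word below is a hypothetical placeholder.
model = word2vec.load(fileName)
indexes, metrics = model.cosine('some_word', n=10)   # 10 nearest neighbours of the query
print(model.generate_response(indexes, metrics))     # (word, similarity) pairs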
def LDA_INIT():
    global model, topic_num
    global vocab, word_dict
    global id_list, idToTitle_dict
    global docTermMatrix

    ### Construct Vocabulary List
    vocab = []
    w = open(vocabFileName, 'r')
    while True:
        line = w.readline()
        if line == '':
            break

        try:
            line.split()[1]               # require a "<term> <frequency>" pair
        except IndexError:
            continue
        if int(line.split()[1]) < termFreqLowBound:
            break                         # vocab file is assumed sorted by frequency, descending
        line = line.split()[0]
        word_dict[line] = 0
    w.close()

    vocab = list(word_dict.keys())
    #############################
    ### Construct Title List
    w = open(cleanFileName, 'r')
    while True:
        line = w.readline()
        if line == '':
            break
        idToTitle_dict.append(line)       # despite its name, this is used as a list of lines
    w.close()
    #############################
    ### Construct Doc-Term Matrix
    f = open(dataSegFileName, 'r')
    count = 0

    while not loadJson:                   # only build the matrix if it is not loaded from JSON below
        line = f.readline()
        count += 1
        print(count)
        if line == '':
            break

        ID = line.split()[0]
        id_list.append(ID)
        line = line.split()[1:]
        reset_word_dict(word_dict)        # zero all counts before counting this document
        for term in line:
            termAttribute = term.split('/')[-1]
            if stop.stop3(termAttribute): # skip stop terms by their POS tag
                continue

            try:
                term = term.split('/')[0]
                if term in word_dict:
                    word_dict[term] += 1
                #else: term is not in the vocabulary and is ignored
            except Exception:
                print('ERROR: term error')
                print(term)

        vec = list(word_dict.values())    # one count per vocabulary word, in dict order
        docTermMatrix.append(vec)

    f.close()
    if loadJson:
        jsonFileName = './%s/%s_LDA_Doc_Term_Matrix_%s.json' % (person, person, dataType)
        docTermMatrix = json.load(open(jsonFileName, 'r'))

    if writeJson:
        jsonFileName = './%s/%s_LDA_Doc_Term_Matrix_%s.json' % (person, person, dataType)
        json.dump(docTermMatrix, open(jsonFileName, 'w'))
    #############################
    print('LDA_INIT DONE!!!')
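# A minimal sketch (not from the original source) of how the globals prepared by
# LDA_INIT could feed a topic model; it assumes the third-party `lda` package and
# numpy are available, and that topic_num has been set elsewhere in the module.
import numpy as np
import lda

LDA_INIT()
X = np.array(docTermMatrix, dtype=np.int64)          # documents x vocabulary counts
model = lda.LDA(n_topics=topic_num, n_iter=500, random_state=1)
model.fit(X)
topic_word = model.topic_word_                       # per-topic word distributions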
Example #3
while True:
    line = f.readline()
    count += 1
    if line == '':
        break
    line = line.split('\t')

    try:
        sentence = line[1].split()        # tab-separated record: field 1 holds the segmented text
    except IndexError:
        print(count, line)
        quit()

    for i in range(len(sentence)):
        termAttr = sentence[i].split('/')[-1]
        if stop.stop3(termAttr):          # skip stop terms by their POS tag
            continue
        bigram = ''
        if i == len(sentence) - 1:        # last token: count the unigram only
            word = sentence[i].split('/')[0]
            addVocab(word)
            vocab[word] += 1
        else:
            word = sentence[i].split('/')[0]

            try:
                if i + 1 != len(sentence) and not stop.stop3(sentence[i+1].split('/')[1]):
                    bigram = word + sentence[i+1].split('/')[0]
                    addVocab(bigram)
                    vocab[bigram] += 1
            except: