# NOTE(review): Python 2 script fragment (bare `print` statements) that has been
# collapsed onto one physical line — all newlines/indentation were stripped, so it
# is not runnable as-is and the original statement nesting cannot be recovered here.
# What the visible tokens show: it names an output pickle for combined
# "mc500+mc160.train" data ("…lstm.noStopWord.x24.pickle"), loads one-hot answers
# via parseAnsOneHot(ansFilename), reads the question/passage text with
# readTXT(txtFilename), loads a stopword list from stopWordFile, loads word2vec
# vectors (Word2Vec.load_word2vec_format, text format), then starts iterating
# txtList converting each lower-cased word — the fragment is TRUNCATED mid-loop
# after `word = word.lower()`.
# NOTE(review): `parseAnsOneHot`, `readTXT`, `ansFilename`, `wordvec_file`, `np`,
# and `Word2Vec` are not defined in the visible text — presumably imported/defined
# earlier in the original file; confirm before reconstructing.
# NOTE(review): `txtFilename2` is assigned but never used in the visible fragment.
txtFilename = '../Data/mc500.train.txt' txtFilename2 = '../Data/mc160.train.txt' stopWordFile = '../Data/stopwords.txt' #'../Data/mc160.train.txt' #"../Data/mc500.test.txt" #"../Data/mc160.test.txt" #"../Data/mc500.val.txt" #"../Data/mc160.val.txt" dataPickle_name = "../Pickle/" + "mc500+mc160.train" + ".lstm.noStopWord.x24.pickle" print dataPickle_name data = [] ans = parseAnsOneHot(ansFilename) print "Loading", txtFilename.split('/')[-1], "..." txtList = readTXT(txtFilename) stopWord = [] f = open(stopWordFile, 'r') for line in f: stopWord.append(line.split()[0]) idxCounter = 0 print "Loading word2vec..." word_vec = Word2Vec.load_word2vec_format(wordvec_file, binary=False) for one in txtList: #print 'The shape of one before is '+str(np.shape(one)) oneQ = [] for entry in one: tempList = [] temp_vector = np.zeros(300, dtype='float32') for word in entry: word = word.lower()
# NOTE(review): another collapsed Python 2 script fragment (newlines stripped; not
# runnable as-is). This variant derives the pickle name from txtFilename
# ("<name>.<split>.mod1.pickle"), loads word2vec first, then answers
# (parseAnsOneHot) and text (readTXT), builds the stopword list, and hits a
# leftover `pdb.set_trace()` debugger breakpoint before the main loop — that
# breakpoint should almost certainly be removed in the reconstructed script.
# The per-entry loop initializes `count = 0.` and a 300-d float32 zero vector,
# suggesting it averages word vectors while skipping stopwords; the fragment is
# TRUNCATED immediately after `if word in stopWord:` (the branch body is missing).
# NOTE(review): `txtFilename`, `ansFilename`, `wordvec_file`, `parseAnsOneHot`,
# `readTXT`, `np`, `pdb`, and `Word2Vec` are not defined in the visible text —
# presumably defined earlier in the original file; verify before use.
stopWordFile = '../Data/stopwords.txt' #'../Data/mc160.train.txt' #"../Data/mc500.test.txt" #"../Data/mc160.test.txt" #"../Data/mc500.val.txt" #"../Data/mc160.val.txt" dataPickle_name = "../Pickle/"+txtFilename.split('/')[-1].split('.')[0]+"."+txtFilename.split('/')[-1].split('.')[1]+".mod1.pickle" print dataPickle_name print "Loading word2vec..." word_vec = Word2Vec.load_word2vec_format(wordvec_file, binary=False) data = [] ans = parseAnsOneHot(ansFilename) print "Loading",txtFilename.split('/')[-1],"..." txtList = readTXT(txtFilename) stopWord = [] f = open(stopWordFile, 'r') for line in f: stopWord.append(line.split()[0]) pdb.set_trace() idxCounter = 0 for one in txtList: #print 'The shape of one before is '+str(np.shape(one)) oneQ = [] for entry in one: count = 0. temp_vector = np.zeros(300,dtype='float32') for word in entry: word = word.lower() if word in stopWord:
# NOTE(review): this is the SAME script as the previous line ("mod1" pickle
# variant) after an auto-formatter pass (PEP 8 spacing around operators/commas),
# again collapsed onto one physical line and not runnable as-is. It lacks the
# leading txtFilename/stopWordFile assignments visible in the sibling copy —
# they are presumably earlier in the original file. Flow, per the visible tokens:
# build "<name>.<split>.mod1.pickle" output name, load word2vec (text format),
# load one-hot answers and text, collect stopwords, stop at a leftover
# `pdb.set_trace()` breakpoint, then start the per-entry loop that zeroes a 300-d
# float32 accumulator. TRUNCATED after `if word in stopWord:` (branch body
# missing). The file is essentially a duplicate — consider keeping only one copy.
#'../Data/mc160.train.txt' #"../Data/mc500.test.txt" #"../Data/mc160.test.txt" #"../Data/mc500.val.txt" #"../Data/mc160.val.txt" dataPickle_name = "../Pickle/" + txtFilename.split('/')[-1].split( '.')[0] + "." + txtFilename.split('/')[-1].split('.')[1] + ".mod1.pickle" print dataPickle_name print "Loading word2vec..." word_vec = Word2Vec.load_word2vec_format(wordvec_file, binary=False) data = [] ans = parseAnsOneHot(ansFilename) print "Loading", txtFilename.split('/')[-1], "..." txtList = readTXT(txtFilename) stopWord = [] f = open(stopWordFile, 'r') for line in f: stopWord.append(line.split()[0]) pdb.set_trace() idxCounter = 0 for one in txtList: #print 'The shape of one before is '+str(np.shape(one)) oneQ = [] for entry in one: count = 0. temp_vector = np.zeros(300, dtype='float32') for word in entry: word = word.lower() if word in stopWord:
# NOTE(review): collapsed one-line paste of a small word-frequency script. It
# parses '../Data/mc500.train.txt' with the project-local cut1.readTXT, tallies
# token frequencies in a collections.Counter, and tracks the longest record[0]
# token list among records at indices divisible by 4 (MCTest stores 4 questions
# per story, so index % 4 == 0 presumably marks the story/passage entry —
# confirm against cut1.readTXT's output layout).
# CAUTION(review): because indentation was stripped, it is NOT recoverable
# whether the trailing `for j in range(1, len(aaa))` loop sits inside the
# `if i % 4 == 0` block or directly under the outer loop — do not reformat
# this without consulting the original file/VCS history.
# NOTE(review): `import sys` and `numpy as np` are unused in the visible
# fragment; the script may continue past what is shown here.
from cut1 import readTXT import sys import numpy as np import collections a = readTXT('../Data/mc500.train.txt') count = collections.Counter() maxlen = 0 for i, aaa in enumerate(a): if i % 4 == 0: count.update(aaa[0]) if maxlen < len(aaa[0]): maxlen = len(aaa[0]) for j in range(1, len(aaa)): count.update(aaa[j])
# NOTE(review): collapsed fragment of a word→id tokenizer script that starts
# MID-SCRIPT: the leading `exit(1)` is presumably the tail of a usage/argc check
# whose `if` header is not visible, and `argv` suggests `from sys import argv`
# earlier in the original file. Visible logic: build a lower-cased word→id map
# (ids start at 1; `OOV = 1` is assigned but unused in the visible span), parse
# the input with readTXT, and loop printing question ids. The `'''` opens a
# triple-quoted string that is NEVER CLOSED in the visible text — it comments out
# the real per-question token-emitting loop (with fallback lookups stripping
# ".", "'s", ":" suffixes) and is TRUNCATED mid-`elif` chain.
# NOTE(review): `readTXT` is not imported in the visible fragment; presumably
# `from cut1 import readTXT` appears earlier in the original file.
exit(1) input_ = argv[1] output = argv[2] dictionary = argv[3] OOV = 1 d_arr = {} with open(dictionary, 'r') as d: word_id = 1 for line in d: word = line.rstrip().lower() d_arr[word] = word_id word_id += 1 txt = readTXT(input_) #with open(output, 'w') as fout: for q_id in range(len(txt)): print q_id ''' for oneQ in txt: fout = open(output+str(q_id),"w") for word in oneQ[0]: word = word.lower() if word in d_arr: token = d_arr[word] elif word.split(".")[0] in d_arr: token = d_arr[word.split(".")[0]] elif word.split("'s")[0] in d_arr: token = d_arr[word.split("'s")[0]] elif word.split(":")[0] in d_arr:
# NOTE(review): byte-variant duplicate of the first script on this page (same
# "mc500+mc160.train…lstm.noStopWord.x24.pickle" feature-extraction flow) before
# an auto-formatter pass — no spaces around `=` / after commas. Like its sibling
# it is collapsed onto one line, not runnable as-is, and TRUNCATED mid-inner-loop
# after `word = word.lower()`. `parseAnsOneHot`, `readTXT`, `ansFilename`,
# `wordvec_file`, `np`, and `Word2Vec` are not defined in the visible text —
# presumably earlier in the original file. Keeping both copies invites drift;
# consider deleting one once the canonical version is identified.
txtFilename='../Data/mc500.train.txt' txtFilename2='../Data/mc160.train.txt' stopWordFile = '../Data/stopwords.txt' #'../Data/mc160.train.txt' #"../Data/mc500.test.txt" #"../Data/mc160.test.txt" #"../Data/mc500.val.txt" #"../Data/mc160.val.txt" dataPickle_name = "../Pickle/"+"mc500+mc160.train"+".lstm.noStopWord.x24.pickle" print dataPickle_name data = [] ans = parseAnsOneHot(ansFilename) print "Loading",txtFilename.split('/')[-1],"..." txtList = readTXT(txtFilename) stopWord = [] f = open(stopWordFile, 'r') for line in f: stopWord.append(line.split()[0]) idxCounter = 0 print "Loading word2vec..." word_vec = Word2Vec.load_word2vec_format(wordvec_file, binary=False) for one in txtList: #print 'The shape of one before is '+str(np.shape(one)) oneQ = [] for entry in one: tempList = [] temp_vector = np.zeros(300,dtype='float32') for word in entry: word = word.lower()
# NOTE(review): byte-variant duplicate of the word-frequency script above
# (identical tokens, pre-formatter spacing: `i,aaa`, `i%4`, `range(1,len(aaa))`),
# collapsed onto one line. Same caveats apply: the nesting of the trailing
# `for j` loop relative to `if i % 4 == 0` cannot be recovered from the
# stripped indentation, `import sys` / numpy are unused in the visible span,
# and the script may continue beyond what is shown. One of the two copies
# should be dropped once the canonical version is identified.
from cut1 import readTXT import sys import numpy as np import collections a = readTXT('../Data/mc500.train.txt') count = collections.Counter() maxlen = 0 for i,aaa in enumerate(a): if i%4 == 0: count.update(aaa[0]) if maxlen < len(aaa[0]): maxlen = len(aaa[0]) for j in range(1,len(aaa)): count.update(aaa[j])