import json
from xml.dom import minidom

from myutils import debug

# file header elided in the listing; reconstructed so the script runs:
# config and the module-level caches filled by constructMetaData
config = json.load(open('config.json', 'r'))
meta_cache = {}
unique_cats = []

def constructMetaData(dataPath, fileList):
    for xmlFile in fileList:
        debug(dataPath + xmlFile)
        doc = minidom.parse(dataPath + xmlFile)
        threads = doc.getElementsByTagName("Thread")
        for thread in threads:
            # question-level metadata
            relQ = thread.getElementsByTagName('RelQuestion')[0]
            Qid = relQ.getAttribute('RELQ_ID')
            meta_cache[Qid] = {
                'author': relQ.getAttribute('RELQ_USERID'),
                'category': relQ.getAttribute('RELQ_CATEGORY'),
                'time': relQ.getAttribute('RELQ_DATE')
            }
            if meta_cache[Qid]['category'] not in unique_cats:
                unique_cats.append(meta_cache[Qid]['category'])
            # first pass: 'comment#' is the ordinal of this comment among
            # its author's comments in the thread
            user_tracker = {}
            for relC in thread.getElementsByTagName('RelComment'):
                Cid = relC.getAttribute('RELC_ID')
                meta_cache[Cid] = {
                    'author': relC.getAttribute('RELC_USERID'),
                    'time': relC.getAttribute('RELC_DATE')
                }
                if meta_cache[Cid]['author'] not in user_tracker:
                    user_tracker[meta_cache[Cid]['author']] = 0
                user_tracker[meta_cache[Cid]['author']] += 1
                meta_cache[Cid]['comment#'] = user_tracker[meta_cache[Cid]['author']]
            # second pass: '#comment' is the author's total comment count
            # in the thread
            for relC in thread.getElementsByTagName('RelComment'):
                Cid = relC.getAttribute('RELC_ID')
                meta_cache[Cid]['#comment'] = user_tracker[meta_cache[Cid]['author']]
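# For reference, a minimal sketch of the thread XML that the parser above
# expects. The element and attribute names are taken from the code; the
# nesting and the attribute values shown are illustrative only:
#
#   <Thread>
#     <RelQuestion RELQ_ID="Q1" RELQ_USERID="U5"
#                  RELQ_CATEGORY="Computers" RELQ_DATE="2016-01-01">
#       ...
#     </RelQuestion>
#     <RelComment RELC_ID="Q1_C1" RELC_USERID="U7"
#                 RELC_DATE="2016-01-02"> ... </RelComment>
#     <RelComment RELC_ID="Q1_C2" RELC_USERID="U7"
#                 RELC_DATE="2016-01-03"> ... </RelComment>
#   </Thread>
#
# With two comments by U7 as above, both get '#comment' = 2, while
# 'comment#' is 1 for the first and 2 for the second.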
import json

import numpy as np

from myutils import debug
# gensim modules
from gensim.models import Doc2Vec
# KMeans clustering
from sklearn.cluster import KMeans
# Model persistence
from sklearn.externals import joblib

config = json.load(open('config.json', 'r'))
cluster_cache = {}

debug('====== IMPORTING DOC2VEC MODEL ======')
modelPath = config['DOC2VEC']['full']['path']
modelName = config['DOC2VEC']['full']['name']
doc2vec = Doc2Vec.load(modelPath + modelName)

debug('====== CONSTRUCTING DATA POINTS ======')
vocab = list(doc2vec.vocab.keys())
X = np.array([doc2vec[w] for w in vocab])
# cast to float64: assigning to X.dtype would reinterpret the raw float32
# buffer instead of converting the values
X = X.astype(np.float64)

debug('====== RUNNING KMEANS ======')
kmeans = KMeans(n_clusters=1000).fit(X)
joblib.dump(kmeans, 'models/cluster/kmeans.pkl')

debug('====== SAVING RESULTS ======')
for i, w in enumerate(vocab):
    # the listing is truncated here; presumably the loop records each
    # word's cluster id in the (otherwise unused) cluster_cache
    cluster_cache[w] = int(kmeans.labels_[i])
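# A minimal usage sketch, run after the listing above: reload the
# persisted model and look up the cluster of a word vector with
# predict(). 'word' is a placeholder vocabulary item, not taken from
# the source.
kmeans = joblib.load('models/cluster/kmeans.pkl')
vec = doc2vec['word'].astype(np.float64).reshape(1, -1)
print(kmeans.predict(vec)[0])  # cluster id in [0, 1000)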
# the listing resumes mid-file; the signature below is reconstructed from
# the call sites addToCache(q[0], q_w) further down
def addToCache(id, wl):
    global tagger, tagger_cache
    # skip ids that have already been tagged
    if tagger_cache.get(id) is not None:
        return
    tags = tagger.tag(wl)
    findUniqueTags(tags)
    tagger_cache[id] = tagsToString(tags)

def POSTag(data):
    for q, cl in data:
        q_w = preprocessor(q[1])
        addToCache(q[0], q_w)
        for c in cl:
            c_w = preprocessor(c[1])
            addToCache(c[0], c_w)

debug('======= TRAIN DATA =======')
dataPath = config['TRAIN_NN']['path']
fileList = config['TRAIN_NN']['files']
data = constructData(dataPath, fileList)
POSTag(data)

debug('======= TEST DATA \'16 =======')
dataPath = config['TEST_NN']['path']
fileList = config['TEST_NN']['2016']['files']
data = constructData(dataPath, fileList)
POSTag(data)

debug('======= TEST DATA \'17 =======')
dataPath = config['TEST_NN']['path']
fileList = config['TEST_NN']['2017']['files']
data = constructData(dataPath, fileList)
POSTag(data)  # truncated in the source; restored to match the '16 block
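# The listing never shows how `tagger` and `tagger_cache` are created.
# One plausible setup, sketched here as an assumption (the source may use
# a different tagger), is NLTK's averaged perceptron tagger, which exposes
# the same .tag(word_list) interface used above. Requires the
# 'averaged_perceptron_tagger' NLTK resource to be downloaded.
from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()
tagger_cache = {}  # id -> POS-tag string

print(tagger.tag(['where', 'is', 'the', 'nearest', 'atm']))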
import json

# gensim corpus lib
from gensim import corpora
# LDA topic modeling lib
from gensim.models.ldamodel import LdaModel
# pre-processing utilities
from myutils import preprocessor, constructData, debug

config = json.load(open('config.json', 'r'))
dataPath = config['TRAIN_NN']['path']
fileList = config['TRAIN_NN']['files']
data = constructData(dataPath, fileList)

debug('====== CONSTRUCTING DOCS AND TEXTS ======')
# one document per question body and per comment body
docs = []
for q, cl in data:
    docs.append(q[1])
    for c in cl:
        docs.append(c[1])
texts = [preprocessor(d) for d in docs]

debug('====== CONSTRUCTING DICTIONARY ======')
dictionary = corpora.Dictionary(texts)
dictionary.save('models/lda/semeval.dict')

debug('====== CONSTRUCTING CORPUS ======')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('models/lda/semeval.mm', corpus)
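# The LdaModel import above is unused in the visible part of the listing,
# so training presumably follows. A minimal sketch of that step; the
# topic count (100) and output path are assumptions, not from the source:
lda = LdaModel(corpus, id2word=dictionary, num_topics=100)
lda.save('models/lda/semeval.lda')
print(lda.show_topic(0))  # inspect the top words of the first topic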
debug('======= TRAIN DATA =======')
dataPath = config['TRAIN_NN']['path']
fileList = config['TRAIN_NN']['files']
constructMetaData(dataPath, fileList)

debug('======= TEST DATA \'16 =======')
dataPath = config['TEST_NN']['path']
fileList = config['TEST_NN']['2016']['files']
constructMetaData(dataPath, fileList)

debug('======= TEST DATA \'17 =======')
dataPath = config['TEST_NN']['path']
fileList = config['TEST_NN']['2017']['files']
constructMetaData(dataPath, fileList)

# persist the collected metadata for the downstream feature scripts
json.dump(meta_cache, open('meta_cache.json', 'w'))
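# A usage sketch for the dump above: downstream scripts can reload the
# cache and look up metadata by question or comment id. The id shown is
# illustrative, not taken from the source.
meta_cache = json.load(open('meta_cache.json', 'r'))
q_meta = meta_cache['Q268']  # hypothetical RELQ_ID
print(q_meta['author'], q_meta['category'], q_meta['time'])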
    # the listing resumes inside the prediction writer: `scores` holds
    # [score, comment_index, rank, label] rows for one question,
    # apparently already sorted by score; assign ranks from that order,
    # then re-sort by comment index so output follows the thread order
    for i in range(len(scores)):
        scores[i][2] = i + 1
    scores = sorted(scores, key=lambda score: score[1])
    for score in scores:
        out.write('\t'.join([
            q[0], cl[score[1]][0],
            str(score[2]), str(score[0]), score[3]
        ]))
        out.write('\n')
    out.close()

if __name__ == '__main__':
    populateParam()
    debug('== IMPORT DOC2VEC MODEL ==')
    doc2vec = loadDoc2Vec('full')

    """ TRAIN MODE """
    debug('======= TRAIN MODE =======')
    dataPath = config['TRAIN_NN']['path']
    fileList = config['TRAIN_NN']['files']
    # skip rebuilding the training data if the cached features exist
    data = constructData(dataPath, fileList) \
        if not os.path.isfile('out/trainNN.npz') else None
    mlp = trainNN(doc2vec, data)

    """ VALIDATION MODE """
    debug('======= VALIDATION =======')
    dataPath = config['VALIDATION']['path']
    fileList = config['VALIDATION']['files']
    data = constructData(dataPath, fileList)
    output = dataPath + config['VALIDATION']['predictions']
    predict(doc2vec, data, output, mlp)
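# For reference, each line written by the prediction loop above is
# tab-separated: question id, comment id, rank, score, label -- the
# SemEval CQA prediction file layout implied by the '\t'.join call.
# An illustrative line (ids and values are made up):
#
#   Q268    Q268_C4    1    0.9173    true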