Esempio n. 1
0
def main():
    """Load whole-text documents, run TopicVec inference, and save the
    resulting topic matrices to text files."""
    config = getOptions()

    docwords = []
    for doc_filename in config['doc_filenames']:
        # Read the entire document as a single string.
        with open(doc_filename) as doc_file:
            doc_text = doc_file.read()

        sentence_words, word_count = extractSentenceWords(doc_text, 2)
        print("%d words extracted from '%s'" % (word_count, doc_filename))
        docwords.append(sentence_words)

    topicvec = topicvecDir(**config)
    topicvec.setDocs(docwords, config['doc_filenames'])

    # Optional k-means baseline evaluation replaces normal inference.
    if 'evalKmeans' in config and config['evalKmeans']:
        topicvec.kmeans()
        topicvec.printTopWordsInTopic(None, True)
        exit(0)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    # Output filenames are derived from the log filename, extension stripped.
    basetrunk = os.path.splitext(os.path.basename(config['logfilename']))[0]

    # best_last_Ts[0]: (iteration, topic matrix, loglikelihood) of the best EM step.
    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text("%s-em%d-best.topic.vec" % (basetrunk, best_it),
                        "topic", best_T)

    # best_last_Ts[1] holds the last EM step, when it differs from the best.
    last_record = best_last_Ts[1]
    if last_record:
        last_it, last_T, last_loglike = last_record
        save_matrix_as_text("%s-em%d-last.topic.vec" % (basetrunk, last_it),
                            "topic", last_T)
Esempio n. 2
0
def main():
    """Extract words from each configured document file, then run TopicVec
    inference and persist the best/last topic matrices."""
    config = getOptions()

    filenames = config['doc_filenames']
    docwords = []
    for name in filenames:
        with open(name) as handle:
            # readlines + join is equivalent to reading the whole file.
            contents = "".join(handle.readlines())

        words_in_sentences, total_wc = extractSentenceWords(contents, 2)
        print("%d words extracted from '%s'" % (total_wc, name))
        docwords.append(words_in_sentences)

    topicvec = topicvecDir(**config)
    topicvec.setDocs(docwords, filenames)

    # When requested, run the k-means evaluation path and stop.
    if 'evalKmeans' in config and config['evalKmeans']:
        topicvec.kmeans()
        topicvec.printTopWordsInTopic(None, True)
        exit(0)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    log_base = os.path.basename(config['logfilename'])
    stem = os.path.splitext(log_base)[0]

    # Save the topic matrix from the best EM iteration.
    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(stem + "-em%d-best.topic.vec" % best_it,
                        "topic", best_T)

    # Save the last-iteration matrix as well, if one was recorded.
    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(stem + "-em%d-last.topic.vec" % last_it,
                            "topic", last_T)
Esempio n. 3
0
def main():
    """Read documents from csv files (one document per row, text in column 0),
    run TopicVec inference, and save the best/last topic matrices.

    Fixes over the original:
    - blank csv rows are skipped (``row[0]`` on an empty row raises IndexError);
    - average word counts guard against division by zero for empty csv files.
    """
    config = getOptions()

    docwords = []
    csvfiles_filecount = 0
    csvfiles_wc = 0
    csvfiles_rowcount = 0
    file_rownames = []
    for csv_filename in config['csv_filenames']:
        csvfile_wc = 0
        csvfile_rowcount = 0
        with open(csv_filename) as DOC:
            docreader = csv.reader(DOC)
            for row in docreader:
                # Blank lines in a csv yield empty rows; skip them instead of
                # crashing on row[0].
                if not row:
                    continue
                doc = row[0]
                wordsInSentences, wc = extractSentenceWords(doc, min_length=2)
                csvfile_wc += wc
                csvfile_rowcount += 1
                docwords.append(wordsInSentences)
                file_rownames.append("%s-row%d" %
                                     (csv_filename, csvfile_rowcount))
        # Guard: an empty csv file would otherwise raise ZeroDivisionError.
        if csvfile_rowcount > 0:
            csvfile_avgwc = csvfile_wc * 1.0 / csvfile_rowcount
        else:
            csvfile_avgwc = 0.0
        print("%d words extracted from %d rows in '%s'. Avg %.1f words each row" % (
            csvfile_wc, csvfile_rowcount, csv_filename, csvfile_avgwc))

        csvfiles_wc += csvfile_wc
        csvfiles_rowcount += csvfile_rowcount
        csvfiles_filecount += 1
    # Same zero-row guard for the aggregate statistics.
    if csvfiles_rowcount > 0:
        csvfiles_avgwc = csvfiles_wc * 1.0 / csvfiles_rowcount
    else:
        csvfiles_avgwc = 0.0
    if csvfiles_filecount > 1:
        print("%d words extracted from %d rows in %d csv files. Avg %.1f words each row" % (
            csvfiles_wc, csvfiles_rowcount, csvfiles_filecount, csvfiles_avgwc))

    topicvec = topicvecDir(**config)
    topicvec.setDocs(docwords, file_rownames)

    # Optional k-means baseline evaluation replaces normal inference.
    if 'evalKmeans' in config and config['evalKmeans']:
        topicvec.kmeans()
        topicvec.printTopWordsInTopic(None, True)
        exit(0)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    # Output filenames are derived from the log filename, extension stripped.
    basename = os.path.basename(config['logfilename'])
    basetrunk = os.path.splitext(basename)[0]

    # best_last_Ts[0]: (iteration, topic matrix, loglikelihood) of the best EM step.
    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(basetrunk + "-em%d-best.topic.vec" % best_it, "topic",
                        best_T)

    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(basetrunk + "-em%d-last.topic.vec" % last_it,
                            "topic", last_T)
Esempio n. 4
0
def main():
    """Build one document per csv row (text taken from column 0), run
    TopicVec inference, and save the best/last topic matrices.

    Fixes over the original:
    - blank csv rows are skipped (``row[0]`` on an empty row raises IndexError);
    - average word counts guard against division by zero for empty csv files.
    """
    config = getOptions()

    docwords = []
    csvfiles_filecount = 0
    csvfiles_wc = 0
    csvfiles_rowcount = 0
    file_rownames = []
    for csv_filename in config['csv_filenames']:
        csvfile_wc = 0
        csvfile_rowcount = 0
        with open(csv_filename) as DOC:
            docreader = csv.reader(DOC)
            for row in docreader:
                # Skip empty rows produced by blank lines in the csv file.
                if not row:
                    continue
                doc = row[0]
                wordsInSentences, wc = extractSentenceWords(doc, min_length=2)
                csvfile_wc += wc
                csvfile_rowcount += 1
                docwords.append(wordsInSentences)
                file_rownames.append("%s-row%d" % (csv_filename, csvfile_rowcount))
        # An empty csv file would otherwise raise ZeroDivisionError here.
        csvfile_avgwc = (csvfile_wc * 1.0 / csvfile_rowcount) if csvfile_rowcount else 0.0
        print("%d words extracted from %d rows in '%s'. Avg %.1f words each row" % (
            csvfile_wc, csvfile_rowcount, csv_filename, csvfile_avgwc))

        csvfiles_wc += csvfile_wc
        csvfiles_rowcount += csvfile_rowcount
        csvfiles_filecount += 1
    # Same zero-row guard for the aggregate statistics.
    csvfiles_avgwc = (csvfiles_wc * 1.0 / csvfiles_rowcount) if csvfiles_rowcount else 0.0
    if csvfiles_filecount > 1:
        print("%d words extracted from %d rows in %d csv files. Avg %.1f words each row" % (
            csvfiles_wc, csvfiles_rowcount, csvfiles_filecount, csvfiles_avgwc))

    topicvec = topicvecDir(**config)
    topicvec.setDocs(docwords, file_rownames)

    # Optional k-means baseline evaluation replaces normal inference.
    if 'evalKmeans' in config and config['evalKmeans']:
        topicvec.kmeans()
        topicvec.printTopWordsInTopic(None, True)
        exit(0)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    # Output filenames are derived from the log filename, extension stripped.
    basename = os.path.basename(config['logfilename'])
    basetrunk = os.path.splitext(basename)[0]

    # best_last_Ts[0]: (iteration, topic matrix, loglikelihood) of the best EM step.
    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(basetrunk + "-em%d-best.topic.vec" % best_it,
                        "topic", best_T)

    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(basetrunk + "-em%d-last.topic.vec" % last_it,
                            "topic", last_T)
Esempio n. 5
0
def main():
    """Split a snippet file into blank-line-separated snippets, run TopicVec
    inference on them, and save the best/last topic matrices.

    Fixes over the original:
    - ``snip_lines`` is reset after each snippet is flushed; previously every
      snippet accumulated the text of all preceding snippets;
    - a trailing snippet not followed by a blank line is no longer dropped;
    - the average word count guards against division by zero when the file
      contains no snippets.
    """
    config = getOptions()
    snip_filename = config['snip_filename']
    snips_words = []
    snips_name = []

    with open(snip_filename) as DOC:
        snip_lines = []
        snipcount = 0
        snips_wc = 0
        for line in DOC:
            line = line.strip()
            if line:
                snip_lines.append(line)
            elif snip_lines:
                # Blank line ends the current snippet; flush it.
                sniptext = " ".join(snip_lines)
                wordsInSentences, wc = extractSentenceWords(
                    sniptext, remove_punc="iso-8859-1")
                snips_wc += wc
                snipcount += 1
                snips_words.append(wordsInSentences)
                snips_name.append("%s-row%d" % (snip_filename, snipcount))
                # BUG FIX: clear the buffer so snippets don't accumulate.
                snip_lines = []
        # Flush a final snippet that isn't followed by a blank line.
        if snip_lines:
            sniptext = " ".join(snip_lines)
            wordsInSentences, wc = extractSentenceWords(
                sniptext, remove_punc="iso-8859-1")
            snips_wc += wc
            snipcount += 1
            snips_words.append(wordsInSentences)
            snips_name.append("%s-row%d" % (snip_filename, snipcount))

    # Guard: an empty snippet file would otherwise raise ZeroDivisionError.
    snipfile_avgwc = (snips_wc * 1.0 / snipcount) if snipcount else 0.0
    print("%d words extracted from %d snippets in '%s'. Avg %.1f words each row" % (
        snips_wc, snipcount, snip_filename, snipfile_avgwc))

    topicvec = topicvecDir(**config)
    topicvec.setDocs(snips_words, snips_name)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    # Output filenames are derived from the log filename, extension stripped.
    basename = os.path.basename(config['logfilename'])
    basetrunk = os.path.splitext(basename)[0]

    # best_last_Ts[0]: (iteration, topic matrix, loglikelihood) of the best EM step.
    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(basetrunk + "-em%d-best.topic.vec" % best_it, "topic",
                        best_T)

    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(basetrunk + "-em%d-last.topic.vec" % last_it,
                            "topic", last_T)
Esempio n. 6
0
def run_topicvec(config, docwords, file_rownames):
    """Run the TopicVec pipeline and return the fitted model.

    Adapted from the original open source code. Saves the best-iteration
    topic matrix (and the last-iteration one, when recorded) to text files
    named after the configured log filename.
    """
    model = topicvecDir(**config)
    model.setDocs(docwords, file_rownames)
    best_last_Ts, Em, docs_Em, Pi = model.inference()

    # Derive output filename stem from the log filename, extension stripped.
    stem = os.path.splitext(os.path.basename(config['logfilename']))[0]

    # best_last_Ts[0]: (iteration, topic matrix, loglikelihood) of the best EM step.
    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text("%s-em%d-best.topic.vec" % (stem, best_it),
                        "topic", best_T)

    # Also save the last-iteration matrix when one was recorded.
    last_record = best_last_Ts[1]
    if last_record:
        last_it, last_T, last_loglike = last_record
        save_matrix_as_text("%s-em%d-last.topic.vec" % (stem, last_it),
                            "topic", last_T)

    return model
def main():
    """Parse a snippet file (snippets separated by blank lines), run TopicVec
    inference, and save the best/last topic matrices.

    Fixes over the original:
    - ``snip_lines`` is reset after each flush; previously every snippet
      accumulated the text of all preceding snippets;
    - a trailing snippet without a terminating blank line is no longer lost;
    - the average word count guards against an empty snippet file.
    """
    config = getOptions()
    snip_filename = config['snip_filename']
    snips_words = []
    snips_name = []

    with open(snip_filename) as DOC:
        snip_lines = []
        snipcount = 0
        snips_wc = 0
        for line in DOC:
            line = line.strip()
            if line:
                snip_lines.append(line)
            elif snip_lines:
                # Blank line terminates the current snippet; flush it.
                sniptext = " ".join(snip_lines)
                wordsInSentences, wc = extractSentenceWords(sniptext, remove_punc="iso-8859-1")
                snips_wc += wc
                snipcount += 1
                snips_words.append(wordsInSentences)
                snips_name.append("%s-row%d" % (snip_filename, snipcount))
                # BUG FIX: reset the buffer so snippets don't accumulate.
                snip_lines = []
        # Flush a final snippet not followed by a blank line.
        if snip_lines:
            sniptext = " ".join(snip_lines)
            wordsInSentences, wc = extractSentenceWords(sniptext, remove_punc="iso-8859-1")
            snips_wc += wc
            snipcount += 1
            snips_words.append(wordsInSentences)
            snips_name.append("%s-row%d" % (snip_filename, snipcount))

    # Guard: no snippets would otherwise raise ZeroDivisionError.
    snipfile_avgwc = (snips_wc * 1.0 / snipcount) if snipcount else 0.0
    print("%d words extracted from %d snippets in '%s'. Avg %.1f words each row" % (
        snips_wc, snipcount, snip_filename, snipfile_avgwc))

    topicvec = topicvecDir(**config)
    topicvec.setDocs(snips_words, snips_name)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    # Output filenames are derived from the log filename, extension stripped.
    basename = os.path.basename(config['logfilename'])
    basetrunk = os.path.splitext(basename)[0]

    # best_last_Ts[0]: (iteration, topic matrix, loglikelihood) of the best EM step.
    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(basetrunk + "-em%d-best.topic.vec" % best_it,
                        "topic", best_T)

    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(basetrunk + "-em%d-last.topic.vec" % last_it,
                            "topic", last_T)
Esempio n. 8
0
        ORIG.write("\n")
    ORIG.close()
    print "%d original docs saved in '%s'" % (setDocNum, orig_filename)

    if onlyGetOriginalText:
        continue

    if si == 0:
        if onlyInferTopicProp:
            topicfile_trunk = topic_vec_file.split(".")[0]
            topicTraits = topicfile_trunk.split("-")[3:]
            topicTraitStr = "-".join(topicTraits)
            T = load_matrix_from_text(topic_vec_file, "topic")
            config['K'] = T.shape[0]

        topicvec = topicvecDir(**config)
        out = topicvec.genOutputter(0)

    docs_idx = topicvec.setDocs(orig_docs_words, orig_docs_name)
    docs_name = [orig_docs_name[i] for i in docs_idx]
    docs_cat = [orig_docs_cat[i] for i in docs_idx]
    readDocNum = len(docs_idx)
    out("%d docs left after filtering empty docs" % (readDocNum))
    assert readDocNum == topicvec.D, "Returned %d doc idx != %d docs in Topicvec" % (
        readDocNum, topicvec.D)

    if onlyGetWidMapping:
        sorted_wids = sorted(topicvec.wid2freq.keys())
        uniq_wid_num = len(sorted_wids)
        for i, wid in enumerate(sorted_wids):
            wid2compactId[wid] = i
Esempio n. 9
0
        config['MAX_EM_ITERS'] = MAX_ITERS

loader = corpus2loader[corpusName]
wid2compactId = {}
compactId_words = []
hasIdMapping = False

if onlyInferTopicProp:
    topicfile_trunk = topic_vec_file.split(".")[0]
    topicTraits = topicfile_trunk.split("-")[3:]
    topicTraitStr = "-".join(topicTraits)
    T = load_matrix_from_text( topic_vec_file, "topic" )
    config['K'] = T.shape[0]

config['logfilename'] = corpusName
topicvec = topicvecDir(**config)
out = topicvec.genOutputter(0)

for si, subsetName in enumerate(subsetNames):       
    print "Process subset '%s':" %subsetName
    if subsetName == 'all-mapping':
        subsetName = 'all'
        onlyGetWidMapping = True
    else:
        onlyGetWidMapping = False
        
    subsetDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, cats_docsWords, \
            cats_docNames, category_names = loader(subsetName)
    catNum = len(category_names)
    basename = "%s-%s-%d" %( corpusName, subsetName, subsetDocNum )