def run():
    """Decode the in/out scripts of the first `num_hashes` transactions.

    Fetches transaction hashes from the database, tallies a 'txn_count'
    plus the decoded script type of every input and output script into
    `script_counts`, and writes the running counts to stats.txt
    (checkpointed every 10000 transactions, final dump at the end).
    """
    from Hashes import fetch_hashes  # project-local; kept function-scoped as in the original

    num_hashes = 1000000
    conn_str = "dbname=postgres"
    script_counts = Counts(70)
    # `with open(...)` guarantees the stats file is closed even if a
    # transaction fails mid-loop (the original leaked the handle on error).
    with open("stats.txt", "w") as out, \
         psycopg2.connect(conn_str, cursor_factory=NamedTupleCursor) as conn:
        hashes = fetch_hashes(num_hashes, conn)
        for i, txn_hash in enumerate(hashes):  # renamed from `hash` (shadowed builtin)
            abe_txn = Txn(txn_hash, conn)
            script_counts.add('txn_count')
            for txin in abe_txn.tx_in:
                script_counts.add(decode_script.decode_script(txin.script_bytea))
            for txout in abe_txn.tx_out:
                script_counts.add(decode_script.decode_script(txout.script_bytea))
            # Periodic checkpoint so partial results survive a crash.
            if i > 0 and i % 10000 == 0:
                out.write(script_counts.mkString())
                out.flush()
        out.write(script_counts.mkString())
langLines = f.readlines() f.close() print "Read file" #CLEAN AND FACTORIZE THE LINES cleanLines = [] for line, langLine, soundLine in zip(lines, langLines, soundexLines): cleanedLine = cleanLine(line) cleanedLangLine = cleanLine(langLine) cleanedSoundLine = cleanLine(soundLine) cleanLines.append(factorize(cleanedLine, cleanedLangLine, cleanedSoundLine)) print "cleaned lines" #GET THE COUNTS FROM THE SENTENCES mc = Counts() mcLang = Counts() mcSoundex = Counts() for line in cleanLines: for token in line: mc.incrementWord(token.getFactor(WORD_FACTOR)) #mcLang.incrementWord(token.getFactor(LANGAUGE_FACTOR)) #mcSoundex.incrementWord(token.getFactor(SOUNDEX_FACTOR)) mc.fixRanks() mcLang.fixRanks() mcSoundex.fixRanks() print "Counted the words" print "Number of words in the dataset: ", len(mc) print "Number of languages in the dataset: ", len(mcLang) print "Number of sounds in the dataset: ", len(mcSoundex)