import codecs
import multiprocessing
import os
import pickle
import sys
import time

# LocalitySensitiveHashing, MinHash, FeatureExtractor and the helper
# functions used below are project-local modules; their import paths are
# assumed to be set up elsewhere in the repo.


def index_shingles(shingles_file, n_bands, n_sigs, knn):
    """
    Parses already extracted shingles from a file.
    File format is:
    rel_id \t relationship_type \t shingle1 shingle2 shingle3 ... shingle_n
    """
    f_shingles = codecs.open(shingles_file, encoding='utf-8')
    relationships = []
    print "Reading features file"
    for line in f_shingles:
        rel_id, rel_type, shingles = line.split('\t')
        shingles = shingles.strip().split(' ')
        relationships.append((rel_type, rel_id, shingles))
    f_shingles.close()

    print "SIGS  :", n_sigs
    print "BANDS :", n_bands
    lsh = LocalitySensitiveHashing(n_bands, n_sigs, knn)
    lsh.create()

    count = 0
    elapsed_time = 0
    for r in relationships:
        start_time = time.time()
        sigs = MinHash.signature(r[2], n_sigs)
        lsh.index(r[0], r[1], sigs)
        elapsed_time += time.time() - start_time
        count += 1
        if count % 100 == 0:
            sys.stdout.write("Processed " + str(count) + " in %.2f seconds" % elapsed_time + "\n")
    sys.stdout.write("Total indexing time: %.2f seconds" % elapsed_time + "\n")
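# Illustrative sketch only: the real signatures come from the project's
# MinHash.signature(shingles, n_sigs) call above, whose internals are not
# shown here. Assuming a standard min-hash construction, each of the n_sigs
# hash functions keeps the minimum hash value seen over the shingle set, so
# two shingle sets agree on any one signature position with probability
# equal to their Jaccard similarity.
import random


def minhash_signature_sketch(shingles, n_sigs, max_hash=(1 << 32) - 1):
    # fixed seed so every call draws the same hash family, otherwise
    # signatures from different items would not be comparable
    random.seed(42)
    prime = 4294967311  # first prime larger than 2**32
    # one (a, b) pair per hash function: h_i(x) = (a * x + b) % prime
    coeffs = [(random.randint(1, prime - 1), random.randint(0, prime - 1))
              for _ in range(n_sigs)]
    sig = []
    for a, b in coeffs:
        min_val = max_hash
        for s in shingles:
            h = (a * (hash(s) & max_hash) + b) % prime
            if h < min_val:
                min_val = h
        sig.append(min_val)
    return sig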
def process_training_data(data_file, n_sigs, n_bands):
    print "Extracting features from training data and indexing in LSH\n"
    print "MinHash Signatures :", n_sigs
    print "Bands              :", n_bands
    print

    lsh = LocalitySensitiveHashing(n_bands, n_sigs)
    lsh.create()

    # parallelization: read the file into a shared queue,
    # then each worker process runs a FeatureExtractor
    manager = multiprocessing.Manager()
    queue = manager.Queue()

    print "\nLoading sentences from file"
    f_sentences = codecs.open(data_file, encoding='utf-8')
    count = 0
    for line in f_sentences:
        if line.startswith("#") or line.startswith("\n"):
            continue
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")
        queue.put(line.strip())
    f_sentences.close()
    print queue.qsize(), "sentences loaded"

    num_cpus = 12
    relationships = []
    pipes = [multiprocessing.Pipe(False) for _ in range(num_cpus)]
    processes = [multiprocessing.Process(target=extract_features,
                                         args=(queue, lsh, pipes[i][1]))
                 for i in range(num_cpus)]

    print "\nIndexing relationship instances from sentences"
    print "Running", len(processes), "processes"
    start_time = time.time()
    for proc in processes:
        proc.start()

    # collect the instances extracted by each worker
    for i in range(len(pipes)):
        child_instances = pipes[i][0].recv()
        for x in child_instances:
            relationships.append(x)
        pipes[i][0].close()

    for proc in processes:
        proc.join()

    elapsed_time = time.time() - start_time
    sys.stdout.write("Processed " + str(count) + " in %.2f seconds" % elapsed_time + "\n")

    # write shingles to file; open with an explicit encoding so that
    # non-ASCII relationship types do not raise UnicodeEncodeError
    f_features = codecs.open("features.txt", "w", encoding='utf-8')
    for rel in relationships:
        f_features.write(str(rel[1]) + '\t' + rel[0].decode("utf8") + '\t' + ' '.join(rel[3]) + '\n')
    f_features.close()

    return relationships
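# Hypothetical sketch of the extract_features worker launched above; the
# real implementation lives elsewhere in the repo. Under the assumption
# implied by the parent code, each worker drains the shared queue, turns
# every sentence into an instance tuple whose fields match what
# process_training_data reads back (rel[0] type, rel[1] id, rel[3]
# shingles), and sends the batch to the parent through its pipe end.
import Queue  # Python 2 name of the queue module


def extract_features_sketch(queue, lsh, child_conn):
    instances = []
    while True:
        try:
            sentence = queue.get_nowait()
        except Queue.Empty:
            break
        # feature extraction itself is project-specific and omitted here;
        # whitespace tokens stand in for whatever FeatureExtractor produces,
        # and the real worker presumably also indexes each instance via lsh
        shingles = sentence.split()
        instances.append((sentence.encode("utf8"), len(instances), None, shingles))
    child_conn.send(instances)
    child_conn.close()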
def main():
    ############################
    # CLASSIFY A NEW SENTENCE  #
    ############################
    # argv[1] - true
    # argv[2] - bands file
    # argv[3] - dict (sigs->sentence_id) file
    # argv[4] - sentence to classify
    if sys.argv[1] == 'true':
        print "Loading PoS tagger"
        model = open('postagger/datasets/cintil-reduced-tagset.pkl', "rb")
        pos_tagger = pickle.load(model)
        model.close()

        print "Loading verbs conjugations"
        f_verbs = open('verbs/verbs_conj.pkl', "rb")
        verbs = pickle.load(f_verbs)
        f_verbs.close()

        extractor = FeatureExtractor(pos_tagger, verbs)
        lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)

        # read sentences from file, create a relationship object,
        # extract features/shingles and calculate min-hash sigs
        classify_sentences(sys.argv[2], extractor, lsh)

    ############################
    # INDEX TRAINING INSTANCES #
    ############################
    # argv[1] - false
    # argv[2] - training data file (example: train_data.txt)
    elif sys.argv[1] == 'false':
        # calculate min-hash sigs (from already extracted shingles)
        # and index them in bands
        if os.path.isfile("features.txt"):
            print "Calculating min-hash sigs from features.txt file"
            relationships = load_shingles('features.txt')
            print "\n"
            print "Indexing", len(relationships), "relationships"
            print "MinHash Signatures:", N_SIGS
            print "Bands             :", N_BANDS
            print "Sigs per Band     :", N_SIGS / N_BANDS
            lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)
            lsh.create()
            for r in relationships:
                lsh.index(r)

        # load sentences, extract features, calculate min-hash sigs,
        # index in bands
        else:
            print "Loading PoS tagger"
            model = open('postagger/datasets/cintil-reduced-tagset.pkl', "rb")
            pos_tagger = pickle.load(model)
            model.close()

            print "Loading verbs conjugations"
            f_verbs = open('verbs/verbs_conj.pkl', "rb")
            verbs = pickle.load(f_verbs)
            f_verbs.close()

            extractor = FeatureExtractor(pos_tagger, verbs)

            print "Extracting features from training data and calculating min-hash sigs"
            relationships = load_training_relationships(sys.argv[2], extractor)
            print "\n"
            print "Indexing", len(relationships), "relationships"
            print "MinHash Signatures:", N_SIGS
            print "Bands             :", N_BANDS
            lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)
            lsh.create()
            for r in relationships:
                lsh.index(r)
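# Entry point, with usage reconstructed from the argv comments in main();
# the script name is a placeholder, not confirmed by the source:
#   python lsh_classify.py true  <bands_file> <sigs_dict_file> <sentence>
#   python lsh_classify.py false <training_data_file>
if __name__ == "__main__":
    main()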