def index_shingles(shingles_file, n_bands, n_sigs, knn):
    """
    Parses already extracted shingles from a file.
    File format is: rel_id \t rel_type \t shingle1 shingle2 shingle3 ... shingle_n
    """
    f_shingles = codecs.open(shingles_file, encoding='utf-8')
    relationships = []
    print "Reading features file"
    for line in f_shingles:
        rel_id, rel_type, shingles = line.split('\t')
        shingles = shingles.strip().split(' ')
        relationships.append((rel_type, rel_id, shingles))
    f_shingles.close()

    print "SIGS  :", n_sigs
    print "BANDS :", n_bands

    lsh = LocalitySensitiveHashing(n_bands, n_sigs, knn)
    lsh.create()
    count = 0
    elapsed_time = 0

    for r in relationships:
        start_time = time.time()
        sigs = MinHash.signature(r[2], n_sigs)
        lsh.index(r[0], r[1], sigs)
        elapsed_time += time.time() - start_time
        count += 1
        if count % 100 == 0:
            sys.stdout.write("Processed " + str(count) + " in %.2f seconds" % elapsed_time + "\n")

    sys.stdout.write("Total Indexing time: %.2f seconds" % elapsed_time + "\n")

def process_training_data(data_file, n_sigs, n_bands):
    print "Extracting features from training data and indexing in LSH\n"
    print "MinHash Signatures : ", n_sigs
    print "Bands              : ", n_bands
    print
    lsh = LocalitySensitiveHashing(n_bands, n_sigs)
    lsh.create()

    # parallelization
    # read file into a queue
    # each process runs a FeatureExtractor
    manager = multiprocessing.Manager()
    queue = manager.Queue()
    print "\nLoading sentences from file"
    f_sentences = codecs.open(data_file, encoding='utf-8')
    count = 0
    for line in f_sentences:
        if line.startswith("#") or line.startswith("\n"):
            continue
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")
        queue.put(line.strip())
    f_sentences.close()
    print queue.qsize(), "sentences loaded"

    num_cpus = 12
    relationships = []
    pipes = [multiprocessing.Pipe(False) for _ in range(num_cpus)]
    processes = [
        multiprocessing.Process(target=extract_features,
                                args=(queue, lsh, pipes[i][1]))
        for i in range(num_cpus)
    ]

    print "\nIndexing relationship instances from sentences"
    print "Running", len(processes), " processes"
    start_time = time.time()
    for proc in processes:
        proc.start()

    for recv_end, _ in pipes:
        relationships.extend(recv_end.recv())
        recv_end.close()

    for proc in processes:
        proc.join()

    elapsed_time = time.time() - start_time
    sys.stdout.write("Processed " + str(count) + " in %.2f seconds" % elapsed_time+"\n")

    # write shingles to file
    f_features = open("features.txt", "w")
    for rel in relationships:
        f_features.write(str(rel[1])+'\t'+rel[0].decode("utf8")+'\t'+' '.join(rel[3])+'\n')
    f_features.close()

    return relationships
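
# The extract_features worker target used above is not shown in this example.
# A minimal sketch of what such a worker could look like, inferred only from
# how it is called (drain the shared queue, build one instance per sentence,
# send everything back over the write end of the pipe); the instance layout and
# the actual feature extraction are placeholders, not the original code.
def extract_features_sketch(work_queue, lsh, pipe_out):
    instances = []
    while True:
        try:
            sentence = work_queue.get_nowait()
        except Exception:
            break  # the queue is empty, this worker is done
        # placeholder: extract shingles for `sentence` and build a tuple whose
        # fields match what is written to features.txt above, e.g.
        # instances.append((rel_text, rel_id, <unused>, shingles))
    pipe_out.send(instances)
    pipe_out.close()
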
Example #3

def main():

    ###########################
    # CLASSIFY A NEW SENTENCE
    ###########################
    # argv[1] - classify
    # argv[2] - sentence to classify

    n_bands = int(sys.argv[3])
    n_sigs = int(sys.argv[4])
    knn = int(sys.argv[5])

    if sys.argv[1] == 'classify':
        lsh = LocalitySensitiveHashing(n_bands, n_sigs, knn)
        classify_sentences(sys.argv[2], lsh)

    ####################################################################
    # CLASSIFY A NEW SENTENCE WHOSE SHINGLES WERE ALREADY EXTRACTED
    ####################################################################
    # argv[1] - classify2
    # argv[2] - sentence to classify

    elif sys.argv[1] == 'classify2':
        lsh = LocalitySensitiveHashing(n_bands, n_sigs, knn)
        classify_sentences2(sys.argv[2], lsh, n_sigs)

    ############################
    # INDEX TRAINING INSTANCES #
    ############################
    # argv[1] - index
    # argv[2] - file with training sentences

    elif sys.argv[1] == 'index':
        # calculate min-hash sigs (from already extracted shingles) and index them in bands
        if os.path.isfile("features.txt"):
            index_shingles('features.txt', n_bands, n_sigs, knn)

        # load sentences, extract features, calculate min-hash sigs, index in bands
        else:
            process_training_data(sys.argv[2], n_sigs, n_bands)
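
# Illustrative invocations of this script (the script name is a placeholder;
# the argument order follows the sys.argv parsing above):
#
#   python lsh_script.py index     <training_sentences_file> <n_bands> <n_sigs> <knn>
#   python lsh_script.py classify  <sentence>                <n_bands> <n_sigs> <knn>
#   python lsh_script.py classify2 <sentence>                <n_bands> <n_sigs> <knn>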
Example #4

def main():

    ###########################
    # CLASSIFY A NEW SENTENCE
    ###########################
    # argv[1] - true
    # argv[2] - bands file
    # argv[3] - dict (sigs->sentence_id) file
    # argv[4] - sentence to classify

    if sys.argv[1] == 'true':

        print "Loading PoS tagger"
        model = open('postagger/datasets/cintil-reduced-tagset.pkl', "rb")
        pos_tagger = pickle.load(model)
        model.close()

        print "Loading verbs conjugations"
        f_verbs = open('verbs/verbs_conj.pkl', "rb")
        verbs = pickle.load(f_verbs)
        f_verbs.close()

        extractor = FeatureExtractor(pos_tagger, verbs)
        lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)
        
        # read sentences from file
        # create a relationship object
        # extract features/shingles and calculate min-hash sigs
        classify_sentences(sys.argv[2], extractor, lsh)

    ############################
    # INDEX TRAINING INSTANCES #
    ############################
    # argv[1] - false
    # argv[2] - training data file (example: train_data.txt)

    elif sys.argv[1] == 'false':
        # calculate min-hash sigs (from already extracted shingles) and index them in bands
        if os.path.isfile("features.txt"):
            print "Calculating min-hash sigs from features.txt file"
            relationships = load_shingles('features.txt')
            print "\n"
            print "Indexing ", len(relationships), "relationships"
            print "MinHash Signatures: ", N_SIGS
            print "Bands             : ", N_BANDS
            print "Sigs per Band     : ", N_SIGS / N_BANDS
            lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)
            lsh.create()
            for r in relationships:
                lsh.index(r)

        # load sentences, extract features, calculate min-hash sigs, index in bands
        else:
            print "Loading PoS tagger"
            model = open('postagger/datasets/cintil-reduced-tagset.pkl', "rb")
            pos_tagger = pickle.load(model)
            model.close()

            print "Loading verbs conjugations"
            f_verbs = open('verbs/verbs_conj.pkl', "rb")
            verbs = pickle.load(f_verbs)
            f_verbs.close()

            extractor = FeatureExtractor(pos_tagger, verbs)
            #extractor = FeatureExtractor(None, None)
            print "Extracting features from training data and calculating min-hash sigs"
            relationships = load_training_relationships(sys.argv[2], extractor)
            print "\n"
            print "Indexing ", len(relationships), "relationships"
            print "MinHash Signatures: ", N_SIGS
            print "Bands             : ", N_BANDS
            lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)
            lsh.create()
            for r in relationships:
                lsh.index(r)
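
# Illustrative invocations of this script (script name is a placeholder; the
# modes follow the argv comments above):
#
#   python lsh_train.py true  <input_to_classify>
#   python lsh_train.py false <training_data_file>    # e.g. train_data.txt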
Example #7

# NOTE: the imports below are assumptions, not part of the original snippet;
# it relies on time, SciPy's fsolve, and the LocalitySensitiveHashing class
# used further down.
import time

from scipy.optimize import fsolve
from LocalitySensitiveHashing import LocalitySensitiveHashing

start_time = time.time()
d1 = 10
d2 = 80
P1 = 0.99
P2 = 0.01
p1 = (180 - d1) / 180.0  # single-hash collision probability at angle d1
p2 = (180 - d2) / 180.0  # single-hash collision probability at angle d2


# estimate r (rows per band) and b (number of bands) so that the banding
# S-curve  P(s) = 1 - (1 - s**r)**b  passes approximately through
# (p1, P1) and (p2, P2)
def equations(p):
    r, b = p
    return (1 - P1 - (1 - p1**r)**b, 1 - P2 - (1 - p2**r)**b)


r1, b1 = fsolve(equations, (1, 1))
r1, b1 = int(r1), int(b1)
print("r = ", r1, ", b = ", b1)

lsh = LocalitySensitiveHashing(
    datafile="data_for_lsh.csv",
    dim=d,  # d (the dimensionality of the data vectors) is not defined in this snippet
    r=r1,
    b=b1,
)
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()
print("--- %s seconds ---" % (time.time() - start_time))
similarity_neighborhoods = lsh.lsh_basic_for_nearest_neighbors()