import codecs
import multiprocessing
import os
import pickle
import sys
import time

# LocalitySensitiveHashing, MinHash, FeatureExtractor and the helper
# functions used below are project-local modules; their import paths are
# assumed to be set up elsewhere in the repo.


def index_shingles(shingles_file, n_bands, n_sigs, knn):
    """
    Parses already extracted shingles from a file.
    File format is:
    rel_id \t relationship_type \t shingle1 shingle2 shingle3 ... shingle_n
    """
    f_shingles = codecs.open(shingles_file, encoding='utf-8')
    relationships = []
    print "Reading features file"
    for line in f_shingles:
        rel_id, rel_type, shingles = line.split('\t')
        shingles = shingles.strip().split(' ')
        relationships.append((rel_type, rel_id, shingles))
    f_shingles.close()

    print "SIGS  :", n_sigs
    print "BANDS :", n_bands
    lsh = LocalitySensitiveHashing(n_bands, n_sigs, knn)
    lsh.create()

    count = 0
    elapsed_time = 0
    for r in relationships:
        start_time = time.time()
        sigs = MinHash.signature(r[2], n_sigs)
        lsh.index(r[0], r[1], sigs)
        elapsed_time += time.time() - start_time
        count += 1
        if count % 100 == 0:
            sys.stdout.write("Processed " + str(count) + " in %.2f seconds" % elapsed_time + "\n")
    sys.stdout.write("Total indexing time: %.2f seconds" % elapsed_time + "\n")
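# Illustrative sketch only: the real signatures come from the project's
# MinHash.signature(shingles, n_sigs) call above, whose internals are not
# shown here. Assuming a standard min-hash construction, each of the n_sigs
# hash functions keeps the minimum hash value seen over the shingle set, so
# two shingle sets agree on any one signature position with probability
# equal to their Jaccard similarity.
import random


def minhash_signature_sketch(shingles, n_sigs, max_hash=(1 << 32) - 1):
    # fixed seed so every call draws the same hash family, otherwise
    # signatures from different items would not be comparable
    random.seed(42)
    prime = 4294967311  # first prime larger than 2**32
    # one (a, b) pair per hash function: h_i(x) = (a * x + b) % prime
    coeffs = [(random.randint(1, prime - 1), random.randint(0, prime - 1))
              for _ in range(n_sigs)]
    sig = []
    for a, b in coeffs:
        min_val = max_hash
        for s in shingles:
            h = (a * (hash(s) & max_hash) + b) % prime
            if h < min_val:
                min_val = h
        sig.append(min_val)
    return sig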
def process_training_data(data_file, n_sigs, n_bands):
    print "Extracting features from training data and indexing in LSH\n"
    print "MinHash Signatures :", n_sigs
    print "Bands              :", n_bands
    print

    lsh = LocalitySensitiveHashing(n_bands, n_sigs)
    lsh.create()

    # parallelization: read the file into a shared queue,
    # then each worker process runs a FeatureExtractor
    manager = multiprocessing.Manager()
    queue = manager.Queue()

    print "\nLoading sentences from file"
    f_sentences = codecs.open(data_file, encoding='utf-8')
    count = 0
    for line in f_sentences:
        if line.startswith("#") or line.startswith("\n"):
            continue
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")
        queue.put(line.strip())
    f_sentences.close()
    print queue.qsize(), "sentences loaded"

    num_cpus = 12
    relationships = []
    pipes = [multiprocessing.Pipe(False) for _ in range(num_cpus)]
    processes = [multiprocessing.Process(target=extract_features,
                                         args=(queue, lsh, pipes[i][1]))
                 for i in range(num_cpus)]

    print "\nIndexing relationship instances from sentences"
    print "Running", len(processes), "processes"
    start_time = time.time()
    for proc in processes:
        proc.start()

    # collect the instances extracted by each worker
    for i in range(len(pipes)):
        child_instances = pipes[i][0].recv()
        for x in child_instances:
            relationships.append(x)
        pipes[i][0].close()

    for proc in processes:
        proc.join()

    elapsed_time = time.time() - start_time
    sys.stdout.write("Processed " + str(count) + " in %.2f seconds" % elapsed_time + "\n")

    # write shingles to file; open with an explicit encoding so that
    # non-ASCII relationship types do not raise UnicodeEncodeError
    f_features = codecs.open("features.txt", "w", encoding='utf-8')
    for rel in relationships:
        f_features.write(str(rel[1]) + '\t' + rel[0].decode("utf8") + '\t' + ' '.join(rel[3]) + '\n')
    f_features.close()

    return relationships
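# Hypothetical sketch of the extract_features worker launched above; the
# real implementation lives elsewhere in the repo. Under the assumption
# implied by the parent code, each worker drains the shared queue, turns
# every sentence into an instance tuple whose fields match what
# process_training_data reads back (rel[0] type, rel[1] id, rel[3]
# shingles), and sends the batch to the parent through its pipe end.
import Queue  # Python 2 name of the queue module


def extract_features_sketch(queue, lsh, child_conn):
    instances = []
    while True:
        try:
            sentence = queue.get_nowait()
        except Queue.Empty:
            break
        # feature extraction itself is project-specific and omitted here;
        # whitespace tokens stand in for whatever FeatureExtractor produces,
        # and the real worker presumably also indexes each instance via lsh
        shingles = sentence.split()
        instances.append((sentence.encode("utf8"), len(instances), None, shingles))
    child_conn.send(instances)
    child_conn.close()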
def main():
    ############################
    # CLASSIFY A NEW SENTENCE  #
    ############################
    # argv[1] - true
    # argv[2] - bands file
    # argv[3] - dict (sigs->sentence_id) file
    # argv[4] - sentence to classify
    if sys.argv[1] == 'true':
        print "Loading PoS tagger"
        model = open('postagger/datasets/cintil-reduced-tagset.pkl', "rb")
        pos_tagger = pickle.load(model)
        model.close()

        print "Loading verbs conjugations"
        f_verbs = open('verbs/verbs_conj.pkl', "rb")
        verbs = pickle.load(f_verbs)
        f_verbs.close()

        extractor = FeatureExtractor(pos_tagger, verbs)
        lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)

        # read sentences from file, create a relationship object,
        # extract features/shingles and calculate min-hash sigs
        classify_sentences(sys.argv[2], extractor, lsh)

    ############################
    # INDEX TRAINING INSTANCES #
    ############################
    # argv[1] - false
    # argv[2] - training data file (example: train_data.txt)
    elif sys.argv[1] == 'false':
        # calculate min-hash sigs (from already extracted shingles)
        # and index them in bands
        if os.path.isfile("features.txt"):
            print "Calculating min-hash sigs from features.txt file"
            relationships = load_shingles('features.txt')
            print "\n"
            print "Indexing", len(relationships), "relationships"
            print "MinHash Signatures:", N_SIGS
            print "Bands             :", N_BANDS
            print "Sigs per Band     :", N_SIGS / N_BANDS
            lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)
            lsh.create()
            for r in relationships:
                lsh.index(r)

        # load sentences, extract features, calculate min-hash sigs,
        # index in bands
        else:
            print "Loading PoS tagger"
            model = open('postagger/datasets/cintil-reduced-tagset.pkl', "rb")
            pos_tagger = pickle.load(model)
            model.close()

            print "Loading verbs conjugations"
            f_verbs = open('verbs/verbs_conj.pkl', "rb")
            verbs = pickle.load(f_verbs)
            f_verbs.close()

            extractor = FeatureExtractor(pos_tagger, verbs)

            print "Extracting features from training data and calculating min-hash sigs"
            relationships = load_training_relationships(sys.argv[2], extractor)
            print "\n"
            print "Indexing", len(relationships), "relationships"
            print "MinHash Signatures:", N_SIGS
            print "Bands             :", N_BANDS
            lsh = LocalitySensitiveHashing(N_BANDS, N_SIGS, KNN, USE_REDIS)
            lsh.create()
            for r in relationships:
                lsh.index(r)
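# Entry point, with usage reconstructed from the argv comments in main();
# the script name is a placeholder, not confirmed by the source:
#   python lsh_classify.py true  <bands_file> <sigs_dict_file> <sentence>
#   python lsh_classify.py false <training_data_file>
if __name__ == "__main__":
    main()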