Example #1
def main():

    args = read_args()
    logging.info("Reading the file. This may take a few minutes")
    
    f_in, file_size = read_file(args.file_in, args.size)
    
    f = DNAFountain(file_in = f_in, 
                    file_size = file_size, 
                    chunk_size = args.size , 
                    rs = args.rs, 
                    max_homopolymer = args.max_homopolymer,
                    gc = args.gc,
                    delta = args.delta, 
                    c_dist = args.c_dist,
                    np = args.rand_numpy,
                    alpha = args.alpha, 
                    stop = args.stop)

    prng_state = json.loads(f.PRNG.debug())
    logging.info("Upper bound on packets for decoding is %d (x%f) with %f probability\n",
                 int(prng_state['K_prime']), prng_state['Z'], prng_state['delta'])
    if (args.out == '-'):
        out = sys.stdout

    else: 
        out = open (args.out, 'w')
        pbar = tqdm(total= f.final, desc = "Valid oligos")

    prepare(args.max_homopolymer)
    
    used_bc = dict()

    
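    # Keep drawing droplets until enough valid oligos have been generated.
    # f.screen(d) applies the biochemical constraints (GC content, homopolymer
    # length) and counts the droplet towards f.good when it passes.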
    while f.good < f.final:
        d = f.droplet()
        if f.screen(d):
            if not args.no_fasta:
                out.write(">packet {}_{}\n".format(f.good, d.degree))
            out.write("{}\n".format(d.to_human_readable_DNA()))

            if d.seed in used_bc:
                logging.error("Seed %d has been seen before\nDone", d.seed)
                sys.exit(1)

            used_bc[d.seed] = 1
         
            if (args.out != '-'):
                pbar.update()

    if (args.out != '-'):
        pbar.close()
    logging.info("Finished. Generated %d packets out of %d tries (%.3f)", f.good, f.tries, (f.good+0.0)/f.tries)

    out.close()
Example #2
def read_pat(filename, elemsep, linesep, pat):
    filenames = glob.glob(filename + pat)
    print(filenames)
    data = [
        np.array(pp.read_file(filename, elemsep, linesep, "all"))
        for filename in filenames
    ]
    return data
Example #3
def main(args):
    filename = args.filename
    datatype = args.datatype

    if datatype == "SEQUENTIAL":
        #pattern = 'PL1331LAGLX6PH.csv'
        pattern = args.pattern

        filenames = glob.glob(filename + pattern)
        #print(filename+pattern)
        #print(filenames)

        data = [
            pp.read_file(filename,
                         elemsep=args.elemsep,
                         linesep=args.linesep,
                         readlines=args.readlines,
                         mapping=lambda x: x) for filename in filenames
        ]
        for dat in data:
            dat.remove([])

        name_location = 1
        stream_count_location = 3

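        # Plot the stream-count column (index 3) of each file, but only when
        # its peak value exceeds 3e+6.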
        for dat, filename in zip(data, filenames):
            try:
                stream_stats = np.array(
                    [int(row[stream_count_location]) for row in dat])
            except ValueError:
                print(filename)
                stream_stats = [0]

            name = dat[0][name_location]

            if np.max(stream_stats) > 3e+6:
                plt.figure()
                plt.title(name)
                plt.axis([0, 40, 0, 1e+7])
                plt.plot(stream_stats)

        plt.show()

        #numeric_per_row = [pp.count_numeric(row) for row in data[0]]
        #print("Max numeric: " + str(max(numeric_per_row)))
        #print("Argmax numeric: " + str(np.argmax(numeric_per_row)))

    elif datatype == "INSTANCE":
        pattern = '[a-z0-9-]*.csv'
        #pattern = args.pattern
        #serial_number = "MJ0351YNG9Z0XA"
        location = 4

        melt_instance(args, filename, pattern, location)
Example #4
                        for ngramRange in [(1, 1), (1, 2), (1, 3)]:
                            for tfidfFlags in [(False, False, False),
                                               (True, False, False),
                                               (False, False, True)]:
                                parametersList.append(
                                    Parameters(lowerCaseFlag,
                                               removeStopWordsFlag, stemFlag,
                                               testSize, maxFeatures,
                                               ngramRange, tfidfFlags))

    original_stdout = sys.stdout  # Save a reference to the original standard output

    print("Using test param[0]")

    # Find optimal params.
    fileData = preprocessing.read_file("../input-functional.txt")
    Corpus, X, names = getInfoFromParameters(fileData, parametersList[0])
    # Encoder = LabelEncoder()
    # Y = Encoder.fit_transform(Corpus["Class"])
    Y = Corpus["Class"]
    TrainX, TestX, TrainY, TestY = model_selection.train_test_split(
        X, Y, test_size=parametersList[0].testSize)

    # Some prints.
    # print("Corpus[Comment][1]: ",Corpus["Comment"][1])
    # print("Corpus[Class][1]: ",Corpus["Class"][1])
    # print("X[1]: ",X[1])
    # print("Y[1]: ",Y[1])
    # print("names: ",names)

    # print("Corpus[Class]: ",Corpus["Class"])
Example #5
def main():
    args = read_args()
    logging.info("Reading the file. This may take a few minutes")
    
    f_in, file_size = read_file(args.file_in, args.size)
    comp = None
    if args.composite_DNA is not None:
        # alphabet is a dict of int->letter including the std 0->A,1->C,2->G,3->T
        # the composite alphabet file only contains an ordered list of the *additional* letters
        alphabet = read_composite_alphabet(args.composite_DNA[0])
        BC_bases = int(args.composite_DNA[1])
        # TODO - set max binary block limit somehow, get oligo length from somewhere
        if args.composite_encoder is not None:
            composite_encoder = create_composite_encoder(alphabet, int(args.composite_encoder[0]), int(args.composite_encoder[1]))
        else:
            composite_encoder = create_optimal_composite_encoder(alphabet, 10, 136)
        comp = {'alphabet' : alphabet, 'BC_bases': BC_bases,'encoder':composite_encoder}

    f = DNAFountain(file_in = f_in, 
                    file_size = file_size, 
                    chunk_size = args.size , 
                    rs = args.rs, 
                    max_homopolymer = args.max_homopolymer,
                    gc = args.gc,
                    delta = args.delta, 
                    c_dist = args.c_dist,
                    np = args.rand_numpy,
                    alpha = args.alpha, 
                    stop = args.stop,
                    comp = comp,
                    maxseed = args.maxseed)

    prng_state = json.loads(f.PRNG.debug())
    logging.info("Upper bound on packets for decoding is %d (x%f) with %f probability\n",
                 int(prng_state['K_prime']), prng_state['Z'], prng_state['delta'])
    if (args.out == '-'):
        out = sys.stdout

    else: 
        out = open (args.out, 'w')
        pbar = tqdm(total= f.final, desc = "Valid oligos")

    prepare(args.max_homopolymer)
    
    used_bc = dict()
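    # used_bc records every seed that has already been emitted; a repeated
    # seed aborts the run below, since each packet needs a unique seed.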

    while f.good < f.final:
        d = f.droplet()


        if f.screen(d):
            if not args.no_fasta:
                out.write(">packet {}_{}\n".format(f.good, d.degree))
            out.write("{}\n".format(d.to_human_readable_DNA(comp = f.comp)))
            if d.seed in used_bc:
                logging.error("Seed %d has been seen before\nDone", d.seed)
                sys.exit(1)

            used_bc[d.seed] = 1
         
            if (args.out != '-'):
                pbar.update()

    if (args.out != '-'):
        pbar.close()
    logging.info("Finished. Generated %d packets out of %d tries (%.3f)", f.good, f.tries, (f.good+0.0)/f.tries)

    out.close()
Example #6
def main():
    args = read_args()

    if args.debug_barcodes:
        valid_barcodes = load_barcodes(args)
    comp = None
    if args.composite_DNA is not None:
        # alphabet is a dict of int->letter including the std 0->A,1->C,2->G,3->T
        # the composite alphabet file only contains an ordered list of the *additional* letters
        alphabet = read_composite_alphabet(args.composite_DNA[0])
        BC_bases = int(args.composite_DNA[1])
        # TODO - set max binary block limit somehow, get oligo length from somewhere
        if args.composite_encoder is not None:
            composite_encoder = create_composite_encoder(
                alphabet, int(args.composite_encoder[0]),
                int(args.composite_encoder[1]))
        else:
            composite_encoder = create_optimal_composite_encoder(
                alphabet, 10, 136)
        comp = {
            'alphabet': alphabet,
            'BC_bases': BC_bases,
            'encoder': composite_encoder
        }

    truth = None
    if args.truth is not None:
        truth, file_size = read_file(args.truth, args.size)

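    # Glass collects the decoded droplets (via add_dna) and tracks how many
    # of the original chunks have been recovered (chunksDone / isDone).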
    g = Glass(args.chunk_num,
              header_size=args.header_size,
              rs=args.rs,
              c_dist=args.c_dist,
              delta=args.delta,
              flag_correct=not (args.no_correction),
              gc=args.gc,
              max_homopolymer=args.max_homopolymer,
              max_hamming=args.max_hamming,
              decode=not (args.mock),
              chunk_size=args.size,
              np=args.rand_numpy,
              truth=truth,
              out=args.out,
              comp=comp)

    line = 0
    errors = 0
    seen_seeds = defaultdict(int)

    # pbar = tqdm(total= args.chunk_num, desc = "Valid oligos")
    if args.file_in == '-':
        f = sys.stdin
    else:
        try:
            f = open(args.file_in, 'r')
        except IOError:
            logging.error("%s file not found", args.file_in)
            sys.exit(1)

    aggressive = None
    if args.aggressive:
        aggressive = Aggressive(g=g, file_in=f, times=args.aggressive)

    ######## Main loop
    while True:
        try:
            dna = f.readline().rstrip('\n')
        except:
            logging.info("Finished reading input file!")
            break

        if len(dna) == 0:
            logging.info("Finished reading input file!")
            break

        if (args.fasta and re.search(r"^>", dna)):
            continue

        coverage = 0
        # when the file is in the format of coverage \t DNA
        if (len(dna.split()) == 2):
            coverage, dna = dna.split()
            ####Aggresive mode
            if aggressive is not None and aggressive.turn_on(
                    int(coverage), seen_seeds):
                best_file, value = aggressive.start()
                if best_file is not None:
                    copyfile(best_file, args.out)
                    logging.info("Done!")
                else:
                    logging.error("Could not decode all file...")

                sys.exit(1)
                ### End of aggressive mode

        if 'N' in dna and comp is None:
            continue

        line += 1
        seed, data = g.add_dna(dna)

        if seed == -1:  # reed-solomon error!
            errors += 1
        else:
            # pbar.update()
            if args.debug_barcodes:
                if dna not in valid_barcodes:
                    logging.error(
                        "Seed or data %d in line %d are not valid:%s", seed,
                        line, dna)
                else:
                    seen_seeds[dna] += 1
            else:
                seen_seeds[seed] += 1

        if line % 10000 == 0:
            logging.info(
                "After reading %d lines, %d chunks are done. So far: %d rejections (%f) %d barcodes",
                line, g.chunksDone(), errors, errors / (line + 0.0),
                g.len_seen_seed())
            pass

        if line == args.max_line:
            logging.info("Finished reading maximal number of lines")
            break

        if g.isDone():
            logging.info(
                "After reading %d lines, %d chunks are done. So far: %d rejections (%f) %d barcodes",
                line, g.chunksDone(), errors, errors / (line + 0.0),
                g.len_seen_seed())
            logging.info("Done!")
            break

    if not g.isDone():
        logging.error("Could not decode all file...")
        sys.exit(1)

    outstring = g.getString()
    f = open(args.out, 'wb')
    f.write(outstring)
    f.close()

    logging.info("MD5 is %s", md5.new(outstring).hexdigest())

    json.dump(seen_seeds,
              open("seen_barcodes.json", 'w'),
              sort_keys=True,
              indent=4)
Example #7
    files = [
        open("datasets/en_ewt-ud-{}.conllu".format(glob.train_mode), 'r'),
        open("datasets/fr_gsd-ud-{}.conllu".format(glob.train_mode), 'r'),
        open("datasets/es_gsd-ud-{}.conllu".format(glob.train_mode), 'r')
    ]

    files_test = [
        open("datasets/en_ewt-ud-{}.conllu".format(glob.test_mode), 'r'),
        open("datasets/fr_gsd-ud-{}.conllu".format(glob.test_mode), 'r'),
        open("datasets/es_gsd-ud-{}.conllu".format(glob.test_mode), 'r')
    ]

    sentences = []
    sentences_test = []
    for f, f_test in zip(files, files_test):
        sentences += read_file(f)
        sentences_test += read_file(f_test)

else:
    raw_file = open("datasets/fr_gsd-ud-{}.conllu".format(glob.train_mode),
                    'r')
    raw_file_test = open("datasets/fr_gsd-ud-{}.conllu".format(glob.test_mode),
                         'r')

    sentences = read_file(raw_file)
    sentences_test = read_file(raw_file_test)

dico = build_dictionary(sentences)
"""___________________________________ INITIALIZING MODEL _______________________________________"""

Net = XLMEmbMultiLayerBiLSTMPosTagger(dico)
Example #8
def run():
    config = Config()
    save_path = "trained_model/saved_model"

    x_train_path = 'data/xtrain.txt'
    y_train_path = 'data/ytrain.txt'

    x_idx = prep.Indexer()

    X = prep.read_file(x_train_path, raw=True)
    y = prep.read_file(y_train_path, label=True)

    t = CountVectorizer(analyzer='char',
                        ngram_range=(config.ngram_min, config.ngram_max))

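    # Character n-gram pipeline: fit_transform learns the n-gram vocabulary,
    # inverse_transform maps each sample back to its n-gram tokens, and the
    # indexer plus pad_sequences turn those tokens into fixed-length id arrays.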
    X = np.array(
        pad_sequences(
            x_idx.transform(t.inverse_transform(t.fit_transform(X)),
                            matrix=True), config.maxlen))

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=config.test_size, shuffle=config.shuffle)

    del X, y

    # Generate batches
    train_batches = prep.generate_instances(data=x_train,
                                            labels_data=y_train,
                                            n_word=x_idx.max_number() + 1,
                                            n_label=config.label_size,
                                            max_timesteps=config.max_timesteps,
                                            batch_size=config.batch_size)
    validation_batches = prep.generate_instances(
        data=x_test,
        labels_data=y_test,
        n_word=x_idx.max_number() + 1,
        n_label=config.label_size,
        max_timesteps=config.max_timesteps,
        batch_size=config.batch_size)

    # Train the model
    train.train(config,
                train_batches,
                validation_batches,
                x_idx.max_number() + 1,
                save_path,
                from_saved=True)

    # Final Validation
    prediction_batches = prep.generate_instances(
        data=x_test,
        labels_data=None,
        n_word=x_idx.max_number() + 1,
        n_label=config.label_size,
        max_timesteps=config.max_timesteps,
        batch_size=config.batch_size)

    # Predict the model
    predicted_labels = predict.predict(config, prediction_batches,
                                       x_idx.max_number() + 1, save_path)

    report = classification_report(y_test[:len(predicted_labels)],
                                   predicted_labels)
    print(report)

    # Final output

    x_test_path = 'data/xtest.txt'

    X = prep.read_file(x_test_path, raw=True)

    t = CountVectorizer(analyzer='char',
                        ngram_range=(config.ngram_min, config.ngram_max))

    X = np.array(
        pad_sequences(
            x_idx.transform(t.inverse_transform(t.fit_transform(X)),
                            matrix=True,
                            add_if_new=False), config.maxlen))

    prediction_batches = prep.generate_instances(
        data=X,
        labels_data=None,
        n_word=x_idx.max_number() + 1,
        n_label=config.label_size,
        max_timesteps=config.max_timesteps,
        batch_size=config.batch_size)

    predicted_labels = predict.predict(config,
                                       prediction_batches,
                                       x_idx.max_number() + 1,
                                       save_path,
                                       write_to_file=True)
Example #9
#Main should just call the other files

import support_vector_machine as svm
import knn as knn
import preprocessing as preprocessing
import CSV_creator as csv_maker
import matplotlib.pyplot as plt

print('starting...')

file_path = csv_maker.read()

data_frame = preprocessing.read_file(file_path)
data_frame = preprocessing.clean(data_frame)

score_averages = svm.get_plot_feature_scores(data_frame)

plt.plot(score_averages)
plt.ylabel("score averages")
plt.xlabel("number of features")

plt.show()

score_averages = knn.get_plot_feature_scores(data_frame)

plt.plot(score_averages)
plt.ylabel("score averages")
plt.xlabel("number of features")

plt.show()
Example #10
import preprocessing
import n_gram
import another_method
import OWLWriter


if __name__ == "__main__":
    wrdlst = preprocessing.get_wordlist()
    preprocessing.read_file(wrdlst)
    raw_data = preprocessing.removestopwords()
    unigram = n_gram.uni_gram(raw_data)
    bigram = n_gram.bi_gram(raw_data)
    OWLWriter.generate_owl_file(bigram)
    another_method.manual(bigram)

Example #11
def test_incre(data_file,
               link_file,
               concept_file,
               file_type,
               incre_course_num,
               incre_concept_num,
               undirect=False,
               save=True,
               update_A1=False):
    from preprocessing import generate_triple, generate_trn, row_normlize, read_file
    X, links, concept = read_file(data_file,
                                  link_file,
                                  concept_file=concept_file,
                                  file_type=file_type)
    X = X.todense()
    n_course, n_concept = X.shape[0], X.shape[1]
    trn = generate_trn(links, n_course, undirect=undirect)
    tripple = generate_triple(trn)
    split_tripple_list = split_tripple(
        tripple, range(X.shape[0] - incre_course_num, X.shape[0]))
    # find a word that appears only in the last course
    X = row_normlize(X)
    # first, train without the incremental last rows
    # train the model on the full data
    import model
    A0, F0, st = model.cgl_rank(X,
                                tripple,
                                lamb=0.01,
                                eta=1,
                                tolerence=1,
                                silence=False)
    print('finish training whole A\n\n')
    # train the model on the incremental data
    T = split_tripple_list[0]
    A, F, st = model.cgl_rank(X[:-incre_course_num, :-incre_concept_num],
                              T,
                              lamb=0.01,
                              eta=1,
                              tolerence=1,
                              silence=False)

    A = gene_incre_matrix(A, incre_concept_num)
    print('\n\n\n')
    A1, F1, st = incre_cgl_rank_new(
        X, (n_course - incre_course_num, n_concept - incre_concept_num),
        tripple,
        split_tripple_list,
        A,
        eta=5,
        lamb=0.01,
        tolerrence=1,
        update_A1=update_A1)
    file_prefix = 'undirect' if undirect else 'direct'
    file_prefix += '_update_A1' if update_A1 else 'noupdate_A1'
    if save:
        np.savetxt('result/ruc_A_whole_with_essay_{}.txt'.format(file_prefix),
                   A0)
        np.savetxt('result/ruc_A_incre_with_essay_{}.txt'.format(file_prefix),
                   A1)

        np.savetxt('result/ruc_F_whole_with_essay_{}.txt'.format(file_prefix),
                   F0)
        np.savetxt('result/ruc_F_incre_with_essay_{}.txt'.format(file_prefix),
                   F1)
Example #12
@application.route('/')
def index():
    return render_template('index.html')


@application.route('/search/', methods=['POST'])
def search():
    query = request.form['query']
    docs = get_top_k_docs(model, query, corpus, k=100)

    return jsonify(docs)


@application.route('/save_relevance/', methods=['POST'])
def save_relevance():
    query = request.form['query']
    doc_id = request.form['doc_id']
    ip = request.form['ip']
    is_rel = request.form['is_rel']

    store_relevance_judgements(query, doc_id, ip, is_rel)
    return ('', 204)


if __name__ == '__main__':
    print('Loading Corpus...')
    corpus, tokenized_corpus = read_file()
    model = BM25Plus(tokenized_corpus)
    print('Corpus Loaded!')

    application.run()
Example #13
def main(args):
    filename = args.filename
    names = pp.just_the_names([filename])
    print(names)

    if args.model == "ESN":
        lag = 1
    elif args.model == "SVM":
        lag = 4
    elif args.model == "MLP":
        lag = 5

    global name
    name = names[0]

    data_filename = filename + ".data"
    event_filename = filename + ".events"

    data = pp.read_file(data_filename,
                        elemsep=",",
                        linesep="\n",
                        readlines="all",
                        mapping=lambda x: x)
    events = pp.read_file(event_filename,
                          elemsep=",",
                          linesep="\n",
                          readlines="all",
                          mapping=lambda x: x)

    #data.remove([])
    data = list(filter(([]).__ne__, data))

    if name == "CalIt2":
        data = [row[1:] for row in data]

    event_times = [
        Time_interval(event[0], event[1], event[2]) for event in events
    ]
    gt = generate_gt(data, event_times)
    gt = gt[lag:]

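    # Dataset-specific handling: Dodgers is a single count series (period
    # T=288), while CalIt2 interleaves out/in flow counts (period T=48 each).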
    if name == "Dodgers":
        intensity = np.array([int(row[2]) for row in data])
        #data = [row for row,ints in zip(data,intensity) if ints != -1]
        T = 288
        num_subspace = 2
        #intensity = intensity[np.where(intensity!=-1)[0]]
        raw_intensity = intensity.reshape([len(intensity), 1])
        intensity, gt = preprocess(intensity, gt, T, num_subspace, lag)
    elif name == "CalIt2":
        for row in data:
            if len(row) < 3:
                print(row)
        intensity = np.array([int(row[2]) for row in data])
        gt = gt[::2]
        #data = [row[1:] for row,ints in zip(data,intensity) if 1 ] #ints != -1]
        T = 48
        num_subspace = 3
        raw_intensity = intensity.reshape([int(len(intensity) / 2), 2])
        intensity_out, gt = preprocess(raw_intensity[:, 0], gt, T,
                                       num_subspace, lag)
        intensity_in, __ = preprocess(raw_intensity[:, 1], gt, T, num_subspace,
                                      lag)
        intensity = np.concatenate([intensity_in, intensity_out], axis=1)
    '''
	print(gt.shape)
	plt.plot(intensity[:,-1])
	plt.plot(gt)
	plt.show()	
	'''

    explanations = ["intensity"]

    if 0:  #__name__ == '__main__':
        plt.figure()
        plt.plot(raw_intensity[:, 0] / 10, "b")
        plt.plot(intensity[:, 0], "g")
        plt.plot(gt, "r")
        plt.title("{0:s}".format(names[0]))
        plt.xlabel("Sample no. (time)")
        if name == "CalIt2":
            plt.legend([
                "Raw intensity in flow", "Processed intensity",
                "Event in building"
            ])
            plt.ylabel("No. people/ 3 mins")
        else:
            plt.legend([
                "Raw intensity flow", "Processed intensity", "Event at stadium"
            ])
            plt.ylabel("No. cars/ 5 secs")

        if name == "CalIt2":
            plt.figure()
            plt.plot(raw_intensity[:, 1] / 10, "k")
            plt.plot(intensity[:, 1], "g")
            plt.plot(gt, "r")
            plt.legend([
                "Raw intensity out flow", "Processed intensity",
                "Event at stadium"
            ])
            plt.xlabel("Sample no. (time)")
            plt.ylabel("No. people/ 3 mins")

        plt.show()

    return [intensity], [gt], explanations, names
Example #14
def main(args):
    filename = args.filename
    datatype = args.datatype

    if datatype == "SEQUENTIAL":
        #pattern = 'PL1331LAGLX6PH.csv'
        pattern = args.pattern

        filenames = glob.glob(filename + pattern)
        #print(filename+pattern)
        #print(filenames)

        data = [
            pp.read_file(filename,
                         elemsep=args.elemsep,
                         linesep=args.linesep,
                         readlines=args.readlines) for filename in filenames
        ]
        for dat in data:
            dat.remove([])
        data = [np.array(dat) for dat in data]

        #numeric_per_row = [pp.count_numeric(row) for row in data[0]]
        #print("Max numeric: " + str(max(numeric_per_row)))
        #print("Argmax numeric: " + str(np.argmax(numeric_per_row)))

        if __name__ == '__main__':
            names = just_the_names(filenames)
            # Data selection
            data = [dat[:, normalized_idx] for dat in data]
            #dead_rows(data,filenames)
            qualified = BB_SMART_order
            #data,__ = remove_caught_failures(data,names,qualified)

            #print(max_length)
            #return

            #data = filter_wrt(data,0,2)

            #while True:
            #	x = input('Which feature do you want to look at? ')
            #	x = int(x)
            data = pp.normalize_all(data, leave_zero=True)
            #plot_featurewise(data,names)
            plot_unitwise(data, names)

        else:

            names = just_the_names(filenames)
            # Data selection
            data = [dat[:, normalized_idx] for dat in data]
            dead_rows(data, names)

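            # Start from every SMART column except attributes 194, 5, 187, 188,
            # 197 and 198, then keep only the columns that are numeric and
            # actually change over time.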
            idxs = set(all_smart_except([194, 5, 187, 188, 197, 198]))
            print(idxs)
            #print(set(pp.numeric_idxs(data)))
            idxs = set.intersection(idxs, set(pp.numeric_idxs(data)))
            print(idxs)
            #print(pp.changing_idxs(data))
            idxs = set.intersection(idxs, set(pp.changing_idxs(data)))
            print(idxs)
            print("Qualified indexes: " + str(sorted(idxs)))
            print("Qualified indexes: " +
                  str(sorted([BB_SMART_order[i] for i in idxs])))
            qualified = sorted([BB_SMART_order[i] for i in idxs])

            data = [dat[:, sorted(idxs)] for dat in data]
            keys = [smart_expl(i)[0] for i in sorted(idxs)]
            explanations = [smart_expl(i)[1] for i in sorted(idxs)]

            print("before removing missing, small, and predicted failures: " +
                  str(len(data)))
            __, no_missing = pp.remove_instances_with_missing(data)
            min_sample_time = args.settings["failure_horizon"] + 70
            __, no_small = pp.remove_small_samples(data, min_sample_time)
            #print(no_small)
            if 1:
                __, no_predicted_failures = remove_caught_failures(
                    data, names, qualified)
            else:
                no_predicted_failures = list(range(len(data)))
            cleared_idxs = set.intersection(set(no_missing), set(no_small),
                                            set(no_predicted_failures))
            cleared_idxs = list(cleared_idxs)
            #print(cleared_idxs)
            data = [data[idx] for idx in sorted(cleared_idxs)]
            #print([len(dat) for dat in data])
            names = [names[idx] for idx in sorted(cleared_idxs)]
            print("after removing missing, small, and predicted failures: " +
                  str(len(data)))

            # Mathematical preprocessing
            # add 0 to beginning
            #num_features = len(idxs)
            #data = [np.concatenate([np.zeros([1,num_features]),dat],axis=0) for dat in data]

            extended_features = False
            if extended_features:
                exta = pp.differentiate(data)
                #data = pp.smooth(data,5)
                exta = pp.filter(exta, np.array([1]), np.array([1, -0.8]))
                data = [
                    dat[1:, :] for dat in data
                ]  # have to take a away first so that lengths are correct
                #print(exta[0].shape)
                #print(data[0].shape)
                data = [
                    np.concatenate([dat, ext], axis=1)
                    for dat, ext in zip(data, exta)
                ]

                expl_ext = [expl + " (modified)" for expl in explanations]
                explanations = explanations + expl_ext
            else:
                pass
                #data = pp.differentiate(data)
                #data = pp.smooth(data,10)
                #data = pp.filter(data,np.array([1]),np.array([1,-0.8]))

            data = pp.normalize_all(data, leave_zero=True, mean_def=0)
            #print(explanations)
            #print(keys)
            print("Explanations " + " ".join([
                "{0:s}: {1:s}".format(str(key), str(explanation))
                for key, explanation in zip(keys, explanations)
            ]))

            if args.test_type == "PREDICTION":
                gt = []
            elif args.test_type in ["CLASSIFICATION", "REGRESSION"]:
                X = []
                Y = []
                failed = ["_fail" in name for name in names]

                for x, y in pp.impending_failure(
                        data, failed, args.settings["failure_horizon"],
                        "CLASSIFICATION"):
                    X.append(x)
                    Y.append(y)

                #order = np.random.choice(len(X),len(X),replace=False)
                #data = [X[ord_i] for ord_i in order]
                #gt = [Y[ord_i] for ord_i in order]
                #data = X
                gt = Y

            return data, gt, explanations, names

    elif datatype == "INSTANCE":
        pattern = '[0-9-]*.csv'
        #pattern = args.pattern
        #serial_number = "MJ0351YNG9Z0XA"
        serial_location = 1
        model_location = 2

        melt_instance(args, filename, pattern, serial_location, model_location)
    elif datatype == "POPULATION_STATISTICS":
        product_failures(args, filename)
Example #15
def main():

    args = read_args()

    if args.debug_barcodes:
        valid_barcodes = load_barcodes(args)

    truth = None
    if args.truth is not None:
        truth, file_size = read_file(args.truth, args.size)

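    # Glass collects decoded droplets via add_dna() and reports recovery
    # progress through chunksDone() and isDone().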
    g = Glass(args.chunk_num,
              header_size=args.header_size,
              rs=args.rs,
              c_dist=args.c_dist,
              delta=args.delta,
              flag_correct=not (args.no_correction),
              gc=args.gc,
              max_homopolymer=args.max_homopolymer,
              max_hamming=args.max_hamming,
              decode=not (args.mock),
              exDNA=args.expand_nt,
              chunk_size=args.size,
              np=args.rand_numpy,
              truth=truth,
              out=args.out)

    line = 0
    errors = 0
    seen_seeds = defaultdict(int)

    #pbar = tqdm(total= args.chunk_num, desc = "Valid oligos")
    if args.file_in == '-':
        f = sys.stdin
    else:
        try:
            f = open(args.file_in, 'r')
        except IOError:
            logging.error("%s file not found", args.file_in)
            sys.exit(1)

    aggressive = None
    if args.aggressive:

        aggressive = Aggressive(g=g, file_in=f, times=args.aggressive)

    ######## Main loop
    while True:

        try:
            dna = f.readline().rstrip('\n')
        except:
            logging.info("Finished reading input file!")
            break

        if len(dna) == 0:
            logging.info("Finished reading input file!")
            break

        if (args.fasta and re.search(r"^>", dna)):
            continue

        coverage = 0
        #when the file is in the format of coverage \t DNA
        if (len(dna.split()) == 2):
            coverage, dna = dna.split()
            ####Aggresive mode
            if aggressive is not None and aggressive.turn_on(
                    int(coverage), seen_seeds):
                best_file, value = aggressive.start()
                if best_file is not None:
                    copyfile(best_file, args.out)
                    logging.info("Done!")
                else:
                    logging.error("Could not decode all file...")

                sys.exit(1)
            ### End of aggressive mode

        if 'N' in dna:
            continue

        line += 1
        seed, data = g.add_dna(dna)

        if seed == -1:  #reed-solomon error!
            errors += 1
        else:
            #pbar.update()
            if args.debug_barcodes:
                if dna not in valid_barcodes:
                    logging.error(
                        "Seed or data %d in line %d are not valid:%s", seed,
                        line, dna)
                else:
                    seen_seeds[dna] += 1
            else:
                seen_seeds[seed] += 1

        if line % 1000 == 0:
            logging.info(
                "After reading %d lines, %d chunks are done. So far: %d rejections (%f) %d barcodes",
                line, g.chunksDone(), errors, errors / (line + 0.0),
                g.len_seen_seed())
            pass

        if line == args.max_line:
            logging.info("Finished reading maximal number of lines")
            break

        if g.isDone():
            logging.info(
                "After reading %d lines, %d chunks are done. So far: %d rejections (%f) %d barcodes",
                line, g.chunksDone(), errors, errors / (line + 0.0),
                g.len_seen_seed())
            logging.info("Done!")
            break

    if not g.isDone():
        logging.error("Could not decode all file...")
        sys.exit(1)

    outstring = g.getString()
    f = open(args.out, 'wb')
    f.write(outstring)
    f.close()

    logging.info("MD5 is %s", md5.new(outstring).hexdigest())

    json.dump(seen_seeds,
              open("seen_barcodes.json", 'w'),
              sort_keys=True,
              indent=4)
Example #16
                                        stemFlag, maxFeatures, ngramRange,
                                        tfidfFlags, alpha_value))

    cnt = 0

    # Go through all of the input files and configurations and export the results to a .csv file.
    for input_file, output_file_path, singleFunctionalClass in [
        ("../input.txt", "output/outputNBdirectAlphaAll.csv", False),
        ("../input-functional.txt",
         "output/outputNBdirectAlphaFunctionalAll.csv", True)
    ]:
        with open(output_file_path, 'w') as output:
            print(utilities.getHeader(singleFunctionalClass), file=output)
            output.flush()

            fileData = preprocessing.read_file(input_file)

            for parameters in parametersList:
                print(cnt, ' / ', len(parametersList))
                # datetime object containing current date and time
                print(">>>>>>>>>>>>>>>>>>>>> get info start. now =",
                      datetime.now())

                classifier = MultinomialNB(alpha=parameters.alphaNaiveBayes)
                Corpus, pipeline = utilities.getInfoFromParameters(
                    fileData, parameters, classifier)

                outer_cv = StratifiedKFold(n_splits=10,
                                           shuffle=True,
                                           random_state=42)
Example #17
def main(args):
    filename = args.filename
    datatype = args.datatype

    if datatype == "SEQUENTIAL":
        #pattern = 'PL1331LAGLX6PH.csv'
        pattern = args.pattern

        filenames = glob.glob(filename + pattern)
        filenames = sorted(filenames, reverse=True)
        names = just_the_names(filenames)
        #print(filename+pattern)
        #print(sorted(filenames,reverse=False))
        print(names)

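        # Columns 2-6 hold the predictor features; column 7 is the occupancy
        # ground truth.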
        feature_idxs = [2, 3, 4, 5, 6]
        gt_idx = [7]

        data = [
            pp.read_file(filename,
                         elemsep=args.elemsep,
                         linesep=args.linesep,
                         readlines=args.readlines)
            for filename in sorted(filenames, reverse=True)
        ]
        for dat in data:
            dat.remove([])

        data = [np.array(dat) for dat in data]
        data = [dat[1:, :] for dat in data]
        '''
		for feat in feature_idxs:
			plt.figure()
			for dat in data:
				plt.plot(dat[:,feat])

		plt.show()
		'''
        '''
		data = pp.normalize_all(data,leave_zero=True)	
		
		for dat in data:
			plt.figure()
			for feat in feature_idxs:
				plt.plot(dat[:,feat])
			gt_array = dat[:,gt_idx]
			gt_array = gt_array - min(gt_array)
			gt_array = gt_array/max(gt_array)
			plt.plot(gt_array)
			plt.title("Occupancy and predictors")
			plt.xlabel("Sample no. (time)")
			plt.ylabel("Value (normalized)")
			plt.legend(explanations+["occupancy"])

		plt.show()
		'''

        gt = [dat[:, gt_idx] for dat in data]
        data = [dat[:, feature_idxs] for dat in data]

        data = pp.normalize_all(data, leave_zero=True)

        return data, gt, explanations, names
Example #18
def run():

    config = Config()  # Load configs

    save_path = 'keras_models/keras_model'  # Model save path

    x_train_path = 'data/xtrain.txt'
    x_test_path = 'data/xtest.txt'
    y_train_path = 'data/ytrain.txt'

    x_idx = prep.Indexer()

    X = prep.read_file(x_train_path, raw=True)
    y = prep.read_file(y_train_path, label=True)

    t = CountVectorizer(analyzer='char',
                        ngram_range=(config.ngram, config.ngram))
    t.fit(X)
    X = prep.transform(X, t, x_idx)
    X = np.array(pad_sequences(X, config.maxlen))

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=config.test_size, shuffle=config.shuffle)

    #############################################
    # Train model
    print("BEGINNING TRAINING")
    tsv_logger = CSVLogger('training-data.tsv', append=True, separator='\t')

    m = model_fn(config=config, input_length=x_idx.max_number() + 1)

    # m = load_model(save_path)

    m.fit(x_train,
          y_train,
          epochs=config.n_epochs,
          batch_size=config.batch_size,
          verbose=1,
          shuffle=True,
          callbacks=[tsv_logger],
          validation_data=(x_test, y_test))

    m.save(save_path)

    print("MODEL REPORT")
    score, acc = m.evaluate(x_test, y_test)

    print("\nSCORE: ", score)
    print("ACCURACY: ", acc)

    pred = [np.argmax(label) for label in m.predict(x_test)]

    report = classification_report(y_test, pred)

    print(report)

    ###############################################
    # Predict and write labels for xtest.txt

    print("PREDICTION")

    X = prep.read_file(x_test_path, raw=True)
    X = prep.transform(X, t, x_idx, add_if_new=False)
    X = np.array(pad_sequences(X, config.maxlen))

    pred = [np.argmax(label) for label in m.predict(X)]

    with open("keras_prediction/ytest.txt", "w+", encoding="utf-8") as rec:
        for label in pred:
            rec.write("%s\n" % label)
Example #19
def main(args):
    filename = args.filename
    #datatype = args.datatype

    #if datatype == "SEQUENTIAL":
    #pattern = 'PL1331LAGLX6PH.csv'
    pattern = args.pattern

    filenames = glob.glob(filename + pattern)
    filenames = sorted(filenames, reverse=True)
    names = just_the_names(filenames)
    #print(filename+pattern)
    #print(sorted(filenames,reverse=False))
    print(names)

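    # Columns 0-13 are the 14 EEG channel readings; column 14 is the
    # eye-state ground truth.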
    feature_idxs = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    #feature_idxs = [0,2,11,13]
    gt_idx = [14]

    data = [
        pp.read_file(filename,
                     elemsep=args.elemsep,
                     linesep=args.linesep,
                     readlines=args.readlines)
        for filename in sorted(filenames, reverse=True)
    ]
    for dat in data:
        dat.remove([])

    data = [np.array(dat) for dat in data]
    #data = [dat[1:,:] for dat in data]
    data = [neutralize_outliers(dat) for dat in data]
    '''
	gt = [dat[:,gt_idx] for dat in data]
	data = [dat[:,feature_idxs] for dat in data]	
	data = pp.normalize_all(data,leave_zero=True)

	x = np.linspace(0,117,14980)
	print(x.shape)
	for dat in data:
		print(dat.shape)
		plt.figure()
		for feat in feature_idxs:
			plt.plot(x,dat[:,feat])
		plt.plot(x,gt[0]*5,'b')

	plt.xlabel('time / s')
	plt.ylabel('Normalized EEG value')
	plt.title('EEG Eye Features')
	plt.legend(explanations+["Ground Truth"])
	plt.show()
	'''
    '''
	data = pp.normalize_all(data,leave_zero=True)	
	
	for dat in data:
		plt.figure()
		for feat in feature_idxs:
			plt.plot(dat[:,feat])
		gt_array = dat[:,gt_idx]
		gt_array = gt_array - min(gt_array)
		gt_array = gt_array/max(gt_array)
		plt.plot(gt_array)
		plt.title("Occupancy and predictors")
		plt.xlabel("Sample no. (time)")
		plt.ylabel("Value (normalized)")
		plt.legend(explanations+["occupancy"])

	plt.show()
	'''

    gt = [dat[:, gt_idx] for dat in data]
    data = [dat[:, feature_idxs] for dat in data]

    data = pp.normalize_all(data, leave_zero=True)

    return data, gt, explanations, names