sequences2 = util.get_sequences(file2)
    data, labels = create_data(sequences1, sequences2)
    return data, labels    
    
# pipeline
if __name__ == '__main__':
    # Script entry point. Command line, as read from sys.argv below:
    #   argv[1] = input file, argv[2] = output directory.

    # Report interpreter/library versions. These prints execute before
    # sys.stdout is reassigned further down, so they go to the original
    # stdout rather than to the results file.
    print("Python version:\n{}\n".format(sys.version))
    print("matplotlib version: {}".format(matplotlib.__version__))
    print(plt.style.available)
    print("numpy version: {}".format(np.__version__))
    
    input_file = sys.argv[1]
    output_dir = sys.argv[2]
    util.remove_old_output_files(output_dir)
    # From this point on every print() is written to the file named by
    # util.generate_output_filename(output_dir).
    # NOTE(review): the visible code never closes or restores this handle.
    sys.stdout = open(util.generate_output_filename(output_dir), "w")
    lines = util.readFile(input_file)
    # create_keys is defined outside this view; presumably it builds the
    # n-gram keys of the given length -- TODO confirm.
    threegrams = create_keys(3)
    twograms = create_keys(2)
    # Initialised empty here; nothing in the visible part of the loop
    # appends to them.
    proteins = []
    accuracies = []
    for line in lines:
        # Skip falsy rows (e.g. an empty row).
        if line:
            print("-----------------------------------------")
            print(line[0], line[1], line[2], line[3])
            print("start processing ", line[2], line[3])
            # The first four entries of a row are, in order: first file,
            # second file, analysis type, protein.
            file1 = line[0]
            file2 = line[1]
            analysis_type = line[2]
            protein = line[3]
            # Passes the string "<analysis_type>-<protein>" to the ROC plot
            # helper in util.
            util.start_roc_plot(analysis_type + '-' + protein)
def get_m1_m2_sequences(fasta_file):
    """Split the records of a FASTA file into 'M1' and 'M2' sequences.

    A record is classified by its description line: the description is
    split on "|", and the fourth field is split on ":". The second piece
    of that fourth field is the tag that is compared against 'M1' and
    'M2'. Records with fewer than four "|" fields, with no ":" in the
    fourth field, or with any other tag are ignored.

    The input is parsed exactly once, so this also works when
    ``fasta_file`` is an already-open handle (a second parse of the same
    handle would find it exhausted).

    Args:
        fasta_file: Whatever ``SeqIO.parse`` accepts as its first argument
            for the "fasta" format.

    Returns:
        A tuple ``(m1_sequences, m2_sequences)`` of lists holding the
        ``.seq`` of each matching record, in file order.
    """
    m1_sequences = []
    m2_sequences = []
    buckets = {'M1': m1_sequences, 'M2': m2_sequences}

    for record in SeqIO.parse(fasta_file, "fasta"):
        header_fields = record.description.split("|")
        if len(header_fields) <= 3:
            continue
        tag_parts = header_fields[3].split(":")
        if len(tag_parts) < 2:
            # No ":" in the fourth field: treat it like any other
            # non-matching header instead of raising IndexError.
            continue
        bucket = buckets.get(tag_parts[1])
        if bucket is not None:
            bucket.append(record.seq)

    return m1_sequences, m2_sequences

def readFile(filename):
    """Read a CSV file and return an iterator over its rows.

    The file is parsed completely inside a ``with`` block so the handle is
    closed before this function returns, rather than being left open for
    the lifetime of a live ``csv.reader``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A single-pass iterator that yields each row as a list of strings,
        in file order. Blank lines yield empty lists, as ``csv.reader``
        produces them.

    Raises:
        OSError: If the file cannot be opened.
        csv.Error: If the csv module rejects the file contents.
    """
    # newline='' leaves newline handling to the csv module, as the csv
    # documentation asks for files handed to csv.reader.
    with open(filename, newline='') as f:
        rows = list(csv.reader(f))
    # Return an iterator (not the list) so callers that depend on one-shot
    # reader semantics -- for-loops or next() -- keep working unchanged.
    return iter(rows)
    
#pipeline
if __name__ == '__main__':
    # Script entry point. Command line, as read from sys.argv below:
    #   argv[1] = input file, argv[2] = output directory.
    input_file = sys.argv[1]
    output_dir = sys.argv[2]
    util.remove_old_output_files(output_dir)
    # From this point on every print() is written to the file named by
    # util.generate_output_filename(output_dir, basename='dna_results').
    # NOTE(review): the visible code never closes or restores this handle.
    sys.stdout = open(util.generate_output_filename(output_dir, basename='dna_results'), "w")
    lines = util.readFile(input_file)
    # create_keys is defined outside this view; presumably it builds the
    # n-gram keys of the given length -- TODO confirm.
    threegrams = create_keys(3)
    # Initialised empty here; nothing in the visible part of the loop
    # appends to them.
    proteins = []
    accuracies = []
    
    for line in lines:
        # Skip falsy rows (e.g. an empty row).
        if line:
            print("-----------------------------------------")
            print(line[0], line[1], line[2], line[3])
            print("start processing ", line[2], line[3])
            # The first four entries of a row are, in order: first file,
            # second file, analysis type, protein.
            file1 = line[0]
            file2 = line[1]
            analysis_type = line[2]
            protein = line[3]