def create_amylpred_data(n):
    if os.path.exists("data/temp/amylpred" + str(n) + "set.txt"):
        print "Using existing data."
        return
    if not os.path.exists("data/temp/amino_acid_index.txt"):
        create_aaindex()

    create_amylpred_npeptide_data(n)
    fp = open("data/temp/amylpred" + str(n) + "peptides.txt")
    fn = open("data/temp/neg-" + str(n) + "peptides.txt")
    data = [line.rstrip() + " 1" for line in fp.readlines()]  # Positive data
    neg = [line.rstrip() + " 0" for line in fn.readlines()]  # Negative data
    data.extend(neg)
    # Shuffle the data randomly so that we can do cross-validation
    random.shuffle(data)

    # Creating a dataset with all features
    temp_amylprednset = open("data/temp/temp_amylpred" + str(n) + "set.txt",
                             "w")
    # Compute the features for each sequence and append them to the data
    for i in xrange(len(data)):
        seq_features = " ".join(
            str(e) for e in compute_features(data[i].split()[0]))
        temp_amylprednset.write(data[i] + " " + seq_features + "\n")
    temp_amylprednset.close()

    if not os.path.exists("data/temp/amylpred_feature_dataframe.csv"):
        import optimal_feature_selection as ofs
        ofs.select_optimal_features(n, "amylpred")

    feature_dataframe = pd.read_csv("data/temp/amylpred_feature_dataframe.csv",
                                    index_col=0,
                                    header=0)
    feature_ids = [x for x in feature_dataframe["id"]]
    feature_ids.extend(range(len(feature_ids), len(feature_ids) + n))

    # Compute the features for each sequence and append them to the data
    amylprednset = open("data/temp/amylpred" + str(n) + "set.txt", "w")
    for i in xrange(len(data)):
        seq_features = " ".join(
            str(e) for e in compute_features(data[i].split()[0], feature_ids))
        amylprednset.write(data[i] + " " + seq_features + "\n")
    print "The amylprednset.txt has been created."
def create_zipper_data(n):
    if os.path.exists("data/temp/zipper_hexpepset.txt"):
        print "Using existing data."
        return
    if not os.path.exists("data/temp/amino_acid_index.txt"):
        create_aaindex()

    f = open("data/test/zipper_dataset.txt")
    data = []
    for line in f:
        if line.strip()[0] == "+":
            data.append(line.split()[1] + " 1")  # Positive data
        else:
            data.append(line.split()[1] + " 0")  # Negative data
    # Shuffle the data randomly so that we can do cross-validation
    random.shuffle(data)

    # Creating a dataset with all features
    temp_zipper_hexpepset = open("data/temp/temp_zipper_hexpepset.txt", "w")
    # Compute the features for each sequence and append them to the data
    for i in xrange(len(data)):
        seq_features = " ".join(
            str(e) for e in compute_features(data[i].split()[0]))
        temp_zipper_hexpepset.write(data[i] + " " + seq_features + "\n")
    temp_zipper_hexpepset.close()

    if not os.path.exists("data/temp/zipper_feature_dataframe.csv"):
        import optimal_feature_selection as ofs
        ofs.select_optimal_features(6, "zipper")

    feature_dataframe = pd.read_csv("data/temp/zipper_feature_dataframe.csv",
                                    index_col=0,
                                    header=0)
    feature_ids = [x for x in feature_dataframe["id"]]
    feature_ids.extend(range(len(feature_ids), len(feature_ids) + n))

    # Compute the features for each sequence and append them to the data
    zipper_hexpepset = open("data/temp/zipper_hexpepset.txt", "w")
    for i in xrange(len(data)):
        seq_features = " ".join(
            str(e) for e in compute_features(data[i].split()[0], feature_ids))
        zipper_hexpepset.write(data[i] + " " + seq_features + "\n")
    print "The zipper_hexpepset.txt has been created."
def create_amylpred_data(n):
    if os.path.exists("data/temp/amylpred_hexpepset.txt"):
        print "Using existing data."
        return
    if not os.path.exists("data/temp/amino_acid_index.txt"):
        create_aaindex()
    
    f = open("data/test/amylpred_dataset.txt")
    data = []
    for line in f:
        if line.strip()[0]=="+":
            data.append(line.split()[1] + " 1") # Positive data
        else:
            data.append(line.split()[1] + " 0") # Negative data
    # Shuffle the data randomly so that we can do cross-validation
    random.shuffle(data)
    
    # Creating a dataset with all features
    temp_amylpred_hexpepset = open("data/temp/temp_amylpred_hexpepset.txt", "w")   
    # Compute the features for each sequence and append them to the data
    for i in xrange(len(data)):        
        seq_features = " ".join(str(e) for e in compute_features(data[i].split()[0]))
        temp_amylpred_hexpepset.write(data[i] + " " + seq_features + "\n")
    temp_amylpred_hexpepset.close()
    
    if not os.path.exists("data/temp/amylpred_feature_dataframe.csv"):
        import optimal_feature_selection as ofs
        ofs.select_optimal_features(6, "amylpred")
    
    feature_dataframe = pd.read_csv("data/temp/amylpred_feature_dataframe.csv", 
                                        index_col=0, header=0)
    feature_ids = [x for x in feature_dataframe["id"]]
    feature_ids.extend(range(len(feature_ids), len(feature_ids)+n))
    
    # Compute the features for each sequence and append them to the data
    amylpred_hexpepset = open("data/temp/amylpred_hexpepset.txt", "w") 
    for i in xrange(len(data)):        
        seq_features = " ".join(str(e) for e in compute_features(data[i].split()[0],
                                                                    feature_ids))
        amylpred_hexpepset.write(data[i] + " " + seq_features + "\n")
    print "The amylpred_hexpepset.txt has been created."
def create_amylnset(n):
    """Create data/temp/amyl<n>set.txt: shuffled, labelled n-peptides
    with their selected feature values.

    Positives come from <n>peptides.txt (label 1), negatives from
    neg-<n>peptides.txt (label 0).  A temporary all-features file is
    written first so optimal_feature_selection can rank features if the
    ranking CSV is missing.
    """
    if os.path.exists("data/temp/amyl" + str(n) + "set.txt"):
        return
    if not os.path.exists("data/temp/amino_acid_index.txt"):
        create_aaindex()

    create_npeptide_data(n)
    fp = open("data/temp/" + str(n) + "peptides.txt")
    fn = open("data/temp/neg-" + str(n) + "peptides.txt")
    data = [line.rstrip() + " 1" for line in fp.readlines()]  # Positive data
    neg = [line.rstrip() + " 0" for line in fn.readlines()]  # Negative data
    fp.close()  # BUGFIX: input handles were previously leaked
    fn.close()
    data.extend(neg)
    # Shuffle the data randomly so that we can do cross-validation
    random.shuffle(data)

    # Creating dataset with all features.
    temp_amylnset = open("data/temp/temp_amyl" + str(n) + "set.txt", "w")
    # Compute the features for each sequence and append them to the data
    for i in xrange(len(data)):
        seq_features = " ".join(
            str(e) for e in compute_features(data[i].split()[0]))
        temp_amylnset.write(data[i] + " " + seq_features + "\n")
    temp_amylnset.close()

    # Create the .csv file of the sorted scores of features if it does not exist.
    # NOTE(review): 6 is hard-coded here while n is the peptide length --
    # confirm this is intentional.
    if not os.path.exists("data/temp/amylnset_feature_dataframe.csv"):
        import optimal_feature_selection as ofs
        ofs.select_optimal_features(6, "amylnset")

    # Creating dataset with optimal features.
    amylnset = open("data/temp/amyl" + str(n) + "set.txt", "w")
    feature_dataframe = pd.read_csv("data/temp/amylnset_feature_dataframe.csv",
                                    index_col=0,
                                    header=0)
    feature_ids = [x for x in feature_dataframe["id"]]
    # Extra ids after the ranked ones -- presumably n per-position
    # features appended by compute_features; TODO confirm
    feature_ids.extend(range(len(feature_ids), len(feature_ids) + n))

    # Compute the features for each sequence and append them to the data
    for i in xrange(len(data)):
        seq_features = " ".join(
            str(e) for e in compute_features(data[i].split()[0], feature_ids))
        amylnset.write(data[i] + " " + seq_features + "\n")
    amylnset.close()  # BUGFIX: writer was never closed; file could be truncated