Example #1
import sys

import pandas as pd

# Relies on helpers defined elsewhere in the surrounding portcullis script:
# load_genuine, json2pandas, calcPerformance and the Performance class.
def create_training_sets(args):

    # Load portcullis junctions into dataframe
    print("Loading input junctions ... ", end="", flush=True)
    original = pd.read_csv(args.input, sep='\t', header=0, index_col=0)
    fieldnames = list(original.columns)
    print("done.", len(original), "junctions loaded.")

    # Before we go further, make sure we have a sufficient number of junctions to work with.  Minimum 500.
    if len(original) < 500:
        raise ValueError("Not enough junctions to create training set")

    if args.genuine:
        glist = load_genuine(args.genuine)
        if len(glist) != len(original):
            raise ValueError(
                "Genuine list and input junctions do not contain the same number of elements.  Genuine: "
                + str(len(glist)) + "; input: " + str(len(original)))

        original["genuine"] = pd.Series(glist, index=original.index)

    print()
    print("Creating initial positive set for training")
    print("------------------------------------------")
    print()
    print(
        "Applying the following set of rule-based filters to create initial positive set."
    )
    for i, pjson in enumerate(args.pos_json, start=1):
        print("\t".join([str(i), pjson]))
    print()
    print("LAYER\t", end="")
    if args.genuine:
        print(Performance.longHeader())
    else:
        print("PASS\tFAIL")

    df = original.copy()  # Required for pandas eval
    pos_juncs = None

    # Run through layers of logic to get the positive set
    for i, json_file in enumerate(args.pos_json, start=1):

        print(str(i) + "\t", end="", flush=True)

        # Create the pandas query from the JSON rule file
        with open(json_file) as json_handle:
            pandas_cmd = json2pandas(json_handle, fieldnames)

        # print(pandas_cmd)

        # Execute the pandas command, result should be a filtered dataframe
        pos_juncs = eval(pandas_cmd)

        nb_not_pos = len(original) - len(pos_juncs)

        if args.genuine:
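            # Left anti-join: keep the rows of `original` that are absent from
            # pos_juncs (outer merge with indicator, keep 'left_only' rows).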
            not_pos_juncs = original.reset_index().merge(
                pos_juncs, indicator=True, how='outer').set_index('index')
            not_pos_juncs = not_pos_juncs.loc[not_pos_juncs['_merge'] ==
                                              'left_only']
            del not_pos_juncs['_merge']
            print(calcPerformance(pos_juncs, not_pos_juncs).longStr())
        else:
            print(str(len(pos_juncs)) + "\t" + str(nb_not_pos))

        if args.save_layers:
            pos_juncs.to_csv(args.prefix + ".pos_layer_" + str(i) + ".tab",
                             sep='\t')

        # Check we have enough junctions left in the positive set (minimum 100); if not, stop here
        if len(pos_juncs) <= 100:
            print("WARNING: We recommend at least 100 junctions in the positive set and this set of rules lowered "
                  "the positive set to " + str(len(pos_juncs)) + ".  Will not filter the positive set further.",
                  file=sys.stderr)
            pos_juncs = df.copy()  # Override previous filter
            break

        df = pos_juncs.copy()

    # Get L95 for intron sizes
    if len(pos_juncs) == 0:
        raise ValueError(
            "Can't build training sets, positive set filter left no junctions remaining."
        )

    # L95 = intron size at the 95th percentile of the positive set
    pos_intron_sizes = sorted(pos_juncs["size"].tolist(), key=int)
    L95 = pos_intron_sizes[int(len(pos_intron_sizes) * 0.95)]
    pos_length_limit = int(L95 * 1.2)

    print("Intron size L95 =", L95,
          " positive set maximum intron size limit set to L95 x 1.2:",
          pos_length_limit)

    # Also save this to file as we'll need it back in the C program
    with open(args.prefix + ".L95_intron_size.txt", 'w') as l95out:
        print("Length of intron at 95th percentile", file=l95out)
        print(L95, file=l95out)

    if len(pos_juncs) > 100:
        pos_juncs = pos_juncs.loc[pos_juncs["size"] <= pos_length_limit]
        print("\t".join([str(x) for x in [i + 1, len(pos_juncs), len(original) - len(pos_juncs)]]))

        if args.save_layers:
            pos_juncs.to_csv(args.prefix + ".pos_layer_intronsize.tab",
                             sep='\t')

    print()
    print("Positive set contains:", len(pos_juncs), "junctions")
    print()
    print("Saving positive set to disk ... ", end="", flush=True)
    if args.genuine:
        del pos_juncs["genuine"]
    pos_file = args.prefix + ".pos.junctions.tab"
    pos_juncs.to_csv(pos_file, sep='\t')
    print("done. File saved to:", pos_file)

    not_pos_juncs = original.reset_index().merge(
        pos_juncs, indicator=True, how='outer').set_index('index')
    not_pos_juncs = not_pos_juncs.loc[not_pos_juncs['_merge'] == 'left_only']
    del not_pos_juncs['_merge']

    print(len(not_pos_juncs), "remaining for consideration as negative set")

    print()
    print("Creating initial negative set for training")
    print("------------------------------------------")
    print(
        "Applying a set of rule-based filters to create initial negative set.")
    for i, njson in enumerate(args.neg_json, start=1):
        print("\t".join([str(i), njson]))
    print()
    print("LAYER\t", end="")
    if args.genuine:
        print(Performance.longHeader())
    else:
        print("PASS\tFAIL")

    neg_juncs = []
    df = not_pos_juncs

    # Run through layers of logic to get the negative set
    for i, json_file in enumerate(args.neg_json, start=1):

        print(str(i) + "\t", end="", flush=True)

        # Create the pandas query from the JSON rule file
        with open(json_file) as json_handle:
            pandas_cmd = json2pandas(json_handle, fieldnames)

        # print(pandas_cmd)

        # Execute the pandas command, result should be a filtered dataframe
        neg_juncs.append(eval(pandas_cmd))

        nb_not_neg = len(df) - len(neg_juncs[-1])

        if args.genuine:
            print(calcPerformance(neg_juncs[-1], df).longStr())
        else:
            print(str(len(neg_juncs[-1])) + "\t" + str(nb_not_neg))

        if args.save_layers:
            neg_juncs[-1].to_csv(args.prefix + ".neg_layer_" + str(i) + ".tab",
                                 sep='\t')

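    # Final negative layer: junctions with intron size over 10 x the positive
    # set's L95 go straight into the negative set.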
    neg_length_limit = int(L95 * 10)
    print("Intron size L95 =", L95,
          "negative set will use junctions with intron size over L95 x 10:",
          neg_length_limit)
    neg_juncs.append(df.loc[df["size"] > neg_length_limit])
    if args.genuine:
        print(str(i + 1) + "\t" + calcPerformance(neg_juncs[-1], df).longStr())
    else:
        print(
            str(i + 1) + "\t" + str(len(neg_juncs[-1])) + "\t" +
            str(len(df) - len(neg_juncs[-1])))

    if args.save_layers:
        neg_juncs[-1].to_csv(args.prefix + ".neg_layer_intronsize.tab",
                             sep='\t')

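    # Note: pd.concat keeps duplicates, so a junction matched by more than one
    # negative rule layer will appear multiple times in the combined set.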
    neg_set = pd.concat(neg_juncs)

    print()
    print("Negative set contains:", len(neg_set), "junctions")

    if args.genuine:
        del neg_set["genuine"]

    print("Saving negative set to disk ... ", end="", flush=True)
    neg_file = args.prefix + ".neg.junctions.tab"
    neg_set.to_csv(neg_file, sep='\t')
    print("done. File saved to:", neg_file)

    if args.save_failed:
        print(
            "Creating file containing junctions not in positive or negative set ... ",
            end="",
            flush=True)
        training = pd.concat([pos_juncs, neg_set])
        remaining = original.reset_index().merge(
            training, indicator=True, how='outer').set_index('index')
        others = remaining.loc[remaining['_merge'] == 'left_only']
        other_file = args.prefix + ".others.tab"
        others.to_csv(other_file, sep='\t')
        print("done.  File saved to:", other_file)

    print()
    print("Final train set stats:")
    print(" - Positive set:", len(pos_juncs), "junctions.")
    print(" - Negative set:", len(neg_set), "junctions.")
    print(" - Others:",
          len(original) - len(pos_juncs) - len(neg_set), "junctions.")
    print()
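Both filter loops above hand each JSON rule file to a json2pandas helper from the surrounding portcullis script and eval the resulting string against the dataframe df. A minimal sketch of what such a helper might look like, assuming each rule file stores a single pandas-compatible expression under an "expression" key (the real portcullis rule format and the field names in the example comment are assumptions, not confirmed by the source):

import json

def json2pandas(json_fh, fieldnames):
    # Hypothetical sketch: read one filter expression from a JSON rule file
    # and wrap it in a pandas command that the caller can eval against `df`.
    rule = json.load(json_fh)
    expression = rule["expression"]  # e.g. "nb_rel_aln > 1 and maxmmes >= 10" (illustrative)
    # Fail fast if the expression references a column missing from the input
    # (crude token check; a real implementation would parse the expression).
    for token in expression.replace("(", " ").replace(")", " ").split():
        if token.isidentifier() and token not in fieldnames \
                and token not in ("and", "or", "not"):
            raise ValueError("Unknown field in rule: " + token)
    return "df.query(" + repr(expression) + ")"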
Example #2
import sys
import argparse

import bed12  # portcullis helper module providing BedEntry
# The Performance class comes from the surrounding portcullis scripts.

def main():
	parser = argparse.ArgumentParser("Script to produce a list labelling whether each entry in the portcullis tab input belongs to the bed reference or not.  Please note that this will only work with reference files that have distinct entries.")
	parser.add_argument("input", help="The filtered bed file to assess")
	parser.add_argument("-r", "--reference", required=True, help="The input bed file prior to filtering")
	parser.add_argument("-l", "--labels", required=True, help="The labels which should match entries in reference")
	parser.add_argument("-o", "--output", default="bedref_out.labels", help="Output prefix for output files")
	args = parser.parse_args()

	# Load reference and labels, divide into tp and tn
	rp = set()
	rn = set()

	labs = open(args.labels, "r")
	refs = open(args.reference, "r")

	# read header line from bed file
	header = refs.readline()

	# Now we should have the same number of lines in both files
	line = 1
	while True:
		ref_line = refs.readline()
		lab_line = labs.readline()
		if not ref_line and not lab_line:
			break
		if bool(ref_line) != bool(lab_line):
			print("ERROR: reference file and labels file have a different number of entries.", file=sys.stderr)
			sys.exit(1)
		ref_line = ref_line.strip()
		lab_line = lab_line.strip()
		if lab_line == "1":
			rp.add(bed12.BedEntry.create_from_line(ref_line, False, False))
		elif lab_line == "0":
			rn.add(bed12.BedEntry.create_from_line(ref_line, False, False))
		else:
			print("ERROR: Label file contains an entry that is not either \"0\" or \"1\" at line:", line, file=sys.stderr); exit(1)
		line += 1
	labs.close()
	refs.close()

	print("Reference contains", len(rp), "positive and", len(rn), "negative entries.")

	p = set()
	with open(args.input) as f:

		# Skip header
		h = f.readline()

		for line in f:

			cleanline = line.strip()

			if cleanline != "":
				p.add(bed12.BedEntry.create_from_line(cleanline, False, False))

	print("Input contains", len(p), "entries")

	tp = p & rp
	fp = p & rn
	fn = rp - p
	tn = rn - p

	perf = Performance(tp=len(tp), fp=len(fp), fn=len(fn), tn=len(tn))
	print(Performance.longHeader())
	print(perf.longStr())
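Both examples rely on a Performance class exposing a longHeader() method and a longStr() instance method. A minimal sketch of a compatible class, assuming the long form reports the confusion-matrix counts plus the usual derived rates (the real portcullis class may report different or additional columns):

class Performance:
    # Hypothetical sketch of the interface used above.

    def __init__(self, tp=0, fp=0, fn=0, tn=0):
        self.tp, self.fp, self.fn, self.tn = tp, fp, fn, tn

    @staticmethod
    def longHeader():
        return "\t".join(["TP", "FP", "FN", "TN", "PREC", "RECALL", "F1"])

    def longStr(self):
        prec = self.tp / (self.tp + self.fp) if self.tp + self.fp else 0.0
        rec = self.tp / (self.tp + self.fn) if self.tp + self.fn else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        return "\t".join(
            [str(v) for v in (self.tp, self.fp, self.fn, self.tn)]
            + ["{:.2f}".format(v) for v in (prec, rec, f1)])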