Beispiel #1
0
    def test_validate(self):
        """Check validate_rearrangement() on a good and a bad input.

        ``self.data_good`` must produce a truthy result without raising.
        ``self.data_bad`` must be rejected: either the call returns a falsy
        result or it raises an exception.
        """
        # Good data. The assertion is kept outside the try block so that a
        # failed assertion is reported directly instead of being caught.
        try:
            result = airr.validate_rearrangement(self.data_good)
        except Exception:
            self.fail('validate(): good data failed')
        self.assertTrue(result, 'validate(): good data failed')

        # Bad data. An exception raised by the validator counts as a
        # rejection, but a truthy result must fail the test; the assertion
        # therefore lives in the else branch where nothing can swallow it.
        try:
            result = airr.validate_rearrangement(self.data_bad)
        except Exception:
            pass
        else:
            self.assertFalse(result, 'validate(): bad data failed')
    def test_validate(self):
        """Check validate_rearrangement() on a good and a bad input.

        ``self.data_good`` must produce a truthy result without raising.
        ``self.data_bad`` must produce a falsy result; if the call raises,
        the exception type is printed and the exception is re-raised.
        """
        # Good data. The assertion is kept outside the try block so that a
        # failed assertion is reported directly instead of being caught.
        try:
            result = airr.validate_rearrangement(self.data_good)
        except Exception:
            self.fail('validate(): good data failed')
        self.assertTrue(result, 'validate(): good data failed')

        # Bad data
        try:
            result = airr.validate_rearrangement(self.data_bad)
        except Exception as inst:
            # Show which exception type escaped, then let it propagate with
            # its original traceback.
            print(type(inst))
            raise
        self.assertFalse(result, 'validate(): bad data failed')
Beispiel #3
0
def format_data(args):
    """Read an AIRR rearrangement file into a filtered pandas DataFrame.

    Args:
        args: object whose ``rearrangements`` attribute is handed to
            ``airr.validate_rearrangement`` and ``airr.read_rearrangement``.

    Returns:
        A DataFrame with the columns ``junction_aa``, ``v_call``, ``j_call``,
        ``junction``, ``repertoire_id`` and ``chain``. A row is kept only if
        its ``junction_aa`` is non-empty, contains no "X", starts with "C",
        ends with "F" or "W", and the third character of ``v_call`` equals
        the third character of ``j_call``. ``v_call`` is reduced to the first
        comma-separated call with its last three characters removed, and
        ``chain`` is the third character of that shortened ``v_call`` (NaN if
        it is shorter than three characters).
    """
    # NOTE(review): the validation result is not checked here, so an invalid
    # file is still read below -- confirm whether that is intended.
    airr.validate_rearrangement(args.rearrangements, True)
    reader = airr.read_rearrangement(args.rearrangements)

    # keep only the Junction, Vgene, Jgene and Repertoire ID columns
    keys = ["junction_aa", "v_call", "j_call", "junction", "repertoire_id"]
    records = [{key: row[key] for key in keys} for row in reader]

    # Naming the columns explicitly keeps them present when the reader
    # yields no rows at all.
    df = pd.DataFrame(records, columns=keys)

    # replace cells without junction with NaN and drop those lines; the
    # results are assigned back instead of using inplace=True on a column
    # selection, which is not guaranteed to modify the parent frame
    df["junction_aa"] = df["junction_aa"].replace("", np.nan)
    df = df.dropna(subset=["junction_aa"])

    junction_aa = df["junction_aa"]
    keep = (
        # no X in the junction_aa
        ~junction_aa.str.contains("X", regex=False, na=False)
        # junction_aa starts with C
        & junction_aa.str.startswith("C", na=False)
        # junction_aa ends with F or W
        & junction_aa.str.endswith(("F", "W"), na=False)
        # the chain letter in v_call and j_call matches
        & (df["v_call"].str[2] == df["j_call"].str[2])
    )
    # Work on an independent copy so the column assignments below do not
    # write into a view of the unfiltered frame.
    df = df[keep].copy()

    # keep only the first Vgene when there are multiple in the column
    df["v_call"] = df["v_call"].str.split(",", n=1).str[0]

    # remove allele information from v_call and keep only the gene information
    # (drops the last three characters, e.g. a "*01" suffix)
    df["v_call"] = df["v_call"].str[:-3]
    df["chain"] = df["v_call"].str[2]

    return df
	
	

if __name__ == '__main__':

	# Parse the command line against the module docstring.
	arguments = docopt(__doc__)

	#load saved locus information
	prj_tree	= ProjectFolders(os.getcwd())
	prj_name	= fullpath2last_folder(prj_tree.home)

	# Substitute the project name for the literal "<project>" placeholder.
	arguments['--rearrangements'] = re.sub("<project>", prj_name, arguments["--rearrangements"])

	# The rearrangements file must exist and pass AIRR validation.
	if not os.path.isfile(arguments['--rearrangements']):
		sys.exit("Cannot find rearrangments file %s" % arguments['--rearrangements'])
	elif not airr.validate_rearrangement(arguments['--rearrangements']):
		sys.exit("File %s is not in valid AIRR format." % arguments['--rearrangements'])

	# Translate the --save keyword into the list of categories to keep.
	# The keyword is compared as plain text, ignoring case: it is user input
	# and must not be interpreted as a regular expression pattern.
	save_presets = {
		"all":    [ "canonical_pair", "heavy_only", "light_only", "possible_inclusion", "multi_light", "multi_heavy", "probable_multiplet" ],
		"good":   [ "canonical_pair", "heavy_only", "light_only", "possible_inclusion" ],
		"paired": [ "canonical_pair" ],
	}
	save_choice = str(arguments['--save']).lower()
	if save_choice not in save_presets:
		sys.exit( "%s is not a valid option for --save. Please select from 'all', 'good', or 'paired' only." % arguments['--save'] )
	arguments['--save'] = save_presets[save_choice]

	#log command line
	logCmdLine(sys.argv)

		
Beispiel #5
0
def main():
    """Convert the project's all_seq_stats table into an AIRR rearrangements file.

    Reads ``<tables>/<project>_all_seq_stats.txt`` (tab-delimited, with a
    header row), looks up each row's raw read in the sequence files found in
    the current directory and its trimmed / junction sequences in the
    project's ``_allJ.fa`` and ``_allCDR3.fa`` files, and writes
    ``<tables>/<project>_rearrangements.tsv``. Rows whose status column is
    "wrong_length" are skipped; rows whose raw read cannot be found are
    skipped with a message on stderr. Exits with an error message if the
    written file fails AIRR validation.

    Relies on the module-level ``prj_tree`` and ``prj_name``.
    """

    def to_identity(divergence):
        # Percent divergence ("12.5%" or "12.5") -> fractional identity "0.875".
        return "%.3f" % (1 - float(re.sub("%", "", divergence)) / 100)

    # Substrings of the V call (row[5]) used to pick the locus.
    heavy_marks = ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"]
    lambda_marks = ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"]
    kappa_marks = ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"]

    airrFile = airr.create_rearrangement(
        "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
        fields=[
            'vj_in_frame', 'stop_codon', 'locus', 'c_call', 'junction_length',
            'source_file', 'source_id', 'length_raw', 'length_trimmed',
            'indels', 'status', 'blast_identity', 'cluster_count', 'v_identity'
        ])

    # Make sure the output file is closed even if the conversion fails.
    try:
        #try to vacuum up all possible raw sequences and hope it doesn't kill memory
        raw_seqs = defaultdict(dict)
        file_list = []
        for pattern in ("*.fa", "*.fas", "*.fst", "*.fasta", "*.fna", "*.fq",
                        "*.fastq"):
            file_list += glob.glob(pattern)
        for myseq, myqual, file_name in generate_read_fasta_folder(file_list):
            raw_seqs[file_name][myseq.seq_id] = myseq.seq

        #get trimmed sequences
        trim_seqs = load_fastas("%s/%s_allJ.fa" % (prj_tree.nt, prj_name))

        #get nt junctions
        junc_seqs = load_fastas("%s/%s_allCDR3.fa" % (prj_tree.nt, prj_name))

        #do the conversion
        with open("%s/%s_all_seq_stats.txt" % (prj_tree.tables, prj_name),
                  "r") as handle:
            oldFile = csv.reader(handle, delimiter="\t")
            header = next(oldFile)

            # Columns from index 16 on are optional; the header name of
            # column 16 decides how they are interpreted.
            extra_column = header[16] if len(header) > 16 else None

            for row in oldFile:
                if row[11] == "wrong_length":
                    continue

                if row[1] not in raw_seqs:
                    sys.stderr.write(
                        "Couldn't find raw sequence file %s, %s will be dropped from converted file.\n"
                        % (row[1], row[0]))
                    continue
                elif row[2] not in raw_seqs[row[1]]:
                    sys.stderr.write(
                        "Couldn't find raw sequence %s in file %s; %s will be dropped from converted file.\n"
                        % (row[2], row[1], row[0]))
                    continue

                r = dict()

                r['sequence'] = raw_seqs[row[1]][row[2]]
                # Missing trimmed / junction sequences become empty strings.
                r['sequence_alignment'] = str(
                    trim_seqs.get(row[0], SeqRecord(seq="")).seq)
                r['junction'] = str(
                    junc_seqs.get(row[0], SeqRecord(seq="")).seq)

                r['sequence_id'] = row[0]
                r['source_file'] = row[1]
                r['source_id'] = row[2]
                r['length_raw'] = row[3]

                # "NA" cells are left out of the record entirely.
                if row[4] != "NA":
                    r['length_trimmed'] = row[4]
                if row[5] != "NA":
                    r['v_call'] = row[5]
                if row[6] not in ["NA", "not_found"]:
                    r['d_call'] = row[6]
                if row[7] != "NA":
                    r['j_call'] = row[7]
                if row[9] != "NA":
                    r['indels'] = row[9]
                if row[10] != "NA":
                    r['stop_codon'] = row[10]
                r['status'] = row[11]
                if row[12] != "NA":
                    r['blast_identity'] = to_identity(row[12])
                if row[13] != "NA":
                    # NOTE(review): presumably the +6 adds the two flanking
                    # codons to a CDR3 length -- confirm.
                    r['junction_length'] = int(row[13]) + 6
                if row[15] != "NA":
                    r['junction_aa'] = row[15]

                # Optional trailing columns; every index is checked against
                # the actual row length before it is read.
                if extra_column == "Unique" and len(row) > 16:
                    if row[16] == "T":
                        r['status'] = "unique"
                        if len(row) > 17:
                            r['cluster_count'] = row[17]
                    if len(row) > 18 and row[18] != "NA":
                        r['v_identity'] = to_identity(row[18])
                elif (extra_column == "V_div" and len(row) > 16
                      and row[16] != "NA"):
                    r['v_identity'] = to_identity(row[16])

                #figure out in-frame/productive
                if row[10] == "good":
                    r['vj_in_frame'] = "T"
                    r['productive'] = "T"
                elif row[10] == "stop":
                    r['vj_in_frame'] = "T"
                    r['productive'] = "F"
                elif row[10] == "nonproductive":
                    r['vj_in_frame'] = "F"
                    r['productive'] = "F"
                elif row[10] == "indel":
                    r['productive'] = "F"

                #figure out locus
                if any(x in row[5] for x in heavy_marks):
                    r['locus'] = "IGH"
                elif any(x in row[5] for x in lambda_marks):
                    r['locus'] = "IGL"
                elif any(x in row[5] for x in kappa_marks):
                    r['locus'] = "IGK"

                airrFile.write(r)
    finally:
        airrFile.close()

    valid = airr.validate_rearrangement("%s/%s_rearrangements.tsv" %
                                        (prj_tree.tables, prj_name))
    if not valid:
        sys.exit(
            "ERROR: something went wrong, %s/%s_rearrangements.tsv failed validation!"
            % (prj_tree.tables, prj_name))
Beispiel #6
0
def main():
	"""Convert the project's all_seq_stats table into an AIRR rearrangements file.

	Reads ``<tables>/<project>_all_seq_stats.txt`` (tab-delimited, with a
	header row), looks up each row's raw read in the sequence files found in
	the current directory and its trimmed / junction sequences in the
	project's ``_allJ.fa`` and ``_allCDR3.fa`` files, and writes
	``<tables>/<project>_rearrangements.tsv``. Rows whose status column is
	"wrong_length" are skipped; rows whose raw read cannot be found are
	skipped with a message on stderr. Exits with an error message if the
	written file fails AIRR validation.

	Relies on the module-level ``prj_tree`` and ``prj_name``.
	"""

	def to_identity(divergence):
		# Percent divergence ("12.5%" or "12.5") -> fractional identity "0.875".
		return "%.3f" % ( 1 - float(re.sub("%","",divergence))/100 )

	# Substrings of the V call (row[5]) used to pick the locus.
	heavy_marks  = ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"]
	lambda_marks = ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"]
	kappa_marks  = ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"]

	airrFile = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','length_raw','length_trimmed','indels','status','blast_identity','cluster_count','v_identity'])

	# Make sure the output file is closed even if the conversion fails.
	try:
		#try to vacuum up all possible raw sequences and hope it doesn't kill memory
		raw_seqs = defaultdict( dict )
		file_list = []
		for pattern in ("*.fa", "*.fas", "*.fst", "*.fasta", "*.fna", "*.fq", "*.fastq"):
			file_list += glob.glob(pattern)
		for myseq, myqual, file_name in generate_read_fasta_folder( file_list ):
			raw_seqs[file_name][myseq.seq_id] = myseq.seq

		#get trimmed sequences
		trim_seqs = load_fastas( "%s/%s_allJ.fa"%(prj_tree.nt, prj_name) )

		#get nt junctions
		junc_seqs = load_fastas( "%s/%s_allCDR3.fa"%(prj_tree.nt, prj_name) )

		#do the conversion
		with open( "%s/%s_all_seq_stats.txt"%(prj_tree.tables, prj_name), "r" ) as handle:
			oldFile = csv.reader( handle, delimiter="\t" )
			header = next(oldFile)

			# Columns from index 16 on are optional; the header name of
			# column 16 decides how they are interpreted.
			extra_column = header[16] if len(header) > 16 else None

			for row in oldFile:
				if row[11] == "wrong_length":
					continue

				if row[1] not in raw_seqs:
					sys.stderr.write("Couldn't find raw sequence file %s, %s will be dropped from converted file.\n"%(row[1],row[0]))
					continue
				elif row[2] not in raw_seqs[row[1]]:
					sys.stderr.write("Couldn't find raw sequence %s in file %s; %s will be dropped from converted file.\n"%(row[2],row[1],row[0]))
					continue

				r = dict()

				r['sequence'] = raw_seqs[ row[1] ][ row[2] ]
				# Missing trimmed / junction sequences become empty strings.
				r['sequence_alignment'] = str( trim_seqs.get( row[0], SeqRecord(seq="") ).seq )
				r['junction'] = str( junc_seqs.get( row[0], SeqRecord(seq="") ).seq )

				r['sequence_id'] = row[0]
				r['source_file'] = row[1]
				r['source_id'] = row[2]
				r['length_raw'] = row[3]

				# "NA" cells are left out of the record entirely.
				if row[4] != "NA":
					r['length_trimmed'] = row[4]
				if row[5] != "NA":
					r['v_call'] = row[5]
				if row[6] not in ["NA", "not_found"]:
					r['d_call'] = row[6]
				if row[7] != "NA":
					r['j_call'] = row[7]
				if row[9] != "NA":
					r['indels'] = row[9]
				if row[10] != "NA":
					r['stop_codon'] = row[10]
				r['status'] = row[11]
				if row[12] != "NA":
					r['blast_identity'] = to_identity(row[12])
				if row[13] != "NA":
					# NOTE(review): presumably the +6 adds the two flanking
					# codons to a CDR3 length -- confirm.
					r['junction_length'] = int(row[13])+6
				if row[15] != "NA":
					r['junction_aa'] = row[15]

				# Optional trailing columns; every index is checked against
				# the actual row length before it is read.
				if extra_column == "Unique" and len(row) > 16:
					if row[16] == "T":
						r['status'] = "unique"
						if len(row) > 17:
							r['cluster_count'] = row[17]
					if len(row) > 18 and row[18] != "NA":
						r['v_identity'] = to_identity(row[18])
				elif extra_column == "V_div" and len(row) > 16 and row[16] != "NA":
					r['v_identity'] = to_identity(row[16])

				#figure out in-frame/productive
				if row[10] == "good":
					r['vj_in_frame'] = "T"
					r['productive'] = "T"
				elif row[10] == "stop":
					r['vj_in_frame'] = "T"
					r['productive'] = "F"
				elif row[10] == "nonproductive":
					r['vj_in_frame'] = "F"
					r['productive'] = "F"
				elif row[10] == "indel":
					r['productive'] = "F"

				#figure out locus
				if any( x in row[5] for x in heavy_marks ):
					r['locus'] = "IGH"
				elif any( x in row[5] for x in lambda_marks ):
					r['locus'] = "IGL"
				elif any( x in row[5] for x in kappa_marks ):
					r['locus'] = "IGK"

				airrFile.write(r)
	finally:
		airrFile.close()

	valid = airr.validate_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name) )
	if not valid:
		sys.exit( "ERROR: something went wrong, %s/%s_rearrangements.tsv failed validation!"%(prj_tree.tables, prj_name) )
# Script entry point: parse the command line, locate and validate the
# rearrangements file, and expand the --save keyword into category names.
if __name__ == '__main__':

    # Parse the command line against the module docstring.
    arguments = docopt(__doc__)

    #load saved locus information
    prj_tree = ProjectFolders(os.getcwd())
    prj_name = fullpath2last_folder(prj_tree.home)

    # Substitute the project name for the literal "<project>" placeholder.
    arguments['--rearrangements'] = re.sub("<project>", prj_name,
                                           arguments["--rearrangements"])

    # The rearrangements file must exist and pass AIRR validation.
    if not os.path.isfile(arguments['--rearrangements']):
        sys.exit("Cannot find rearrangments file %s" %
                 arguments['--rearrangements'])
    elif not airr.validate_rearrangement(arguments['--rearrangements']):
        sys.exit("File %s is not in valid AIRR format." %
                 arguments['--rearrangements'])

    # Replace the --save keyword with the list of categories it stands for.
    # NOTE(review): re.match(pattern, string) is called with the user-supplied
    # --save value as the pattern and the keyword as the string, so a prefix
    # of a keyword (or an empty value) also matches and regex metacharacters
    # in the value are interpreted -- confirm whether that is intended.
    if re.match(arguments['--save'], 'all', re.I):
        arguments['--save'] = [
            "canonical_pair", "heavy_only", "light_only", "possible_inclusion",
            "multi_light", "multi_heavy", "probable_multiplet"
        ]
    elif re.match(arguments['--save'], 'good', re.I):
        arguments['--save'] = [
            "canonical_pair", "heavy_only", "light_only", "possible_inclusion"
        ]
    elif re.match(arguments['--save'], 'paired', re.I):
        arguments['--save'] = ["canonical_pair"]
    else:
Beispiel #8
0
# Print the external_fields of the `mored` object, then copy every record of
# my_data.tsv into it with two extra values set on each record.
print(mored.external_fields)
for r in airr.read_rearrangement('my_data.tsv'):
    r['new_field'], r['more_annotation'] = 'A', 'B'
    print(r)
    mored.write(r)
mored.close()

# validate rearrangements file
for _banner_line in ('*****', '*****', 'Validate rearrangements file.',
                     '*****', '*****'):
    print(_banner_line)

# more_data.tsv is expected to pass validation
print('Validating more_data.tsv')
valid = airr.validate_rearrangement('more_data.tsv')
print('PASS: more_data.tsv passes validation.' if valid
      else 'FAIL: more_data.tsv does not pass validation.')

# should fail validation due to missing required field
print('Validating bad_data.tsv')
valid = airr.validate_rearrangement('bad_data.tsv')
print('FAIL: bad_data.tsv passed validation.' if valid
      else 'PASS: bad_data.tsv fails validation.')

# merge rearrangements file
print('*****')