def make_alignment(self, source_alignment_path):
    # Make the alignment
    self.alignment = Alignment()
    self.alignment.read(source_alignment_path)

    # TODO REMOVE -- this should be part of the checking procedure
    # We start by copying the alignment
    self.alignment_path = os.path.join(the_config.start_tree_path,
                                       'source.phy')
    if os.path.exists(self.alignment_path):
        # Make sure it is the same
        old_align = Alignment()
        old_align.read(self.alignment_path)
        if not old_align.same_as(self.alignment):
            log.error("Alignment file has changed since previous run. "
                      "You need to use the force-restart option.")
            raise AnalysisError

        # Species must match as a multiset: same names, order irrelevant
        if collections.Counter(old_align.species) != \
                collections.Counter(self.alignment.species):
            log.error("Species names in the alignment have changed since "
                      "the previous run. You need to use the "
                      "force-restart option.")
            raise AnalysisError
    else:
        self.alignment.write(self.alignment_path)
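
# Illustrative sketch (not from the source): the species check above is a
# multiset comparison -- order must not matter, but duplicates must. A
# hypothetical standalone helper with the same semantics:
import collections

def same_species(names_a, names_b):
    """True when both lists contain exactly the same names, in any order."""
    return collections.Counter(names_a) == collections.Counter(names_b)

# same_species(['frog', 'newt'], ['newt', 'frog'])  -> True
# same_species(['frog', 'newt'], ['newt', 'newt'])  -> False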
def build_expr(self, context, expr, filter=None, align=None): score_expr = LogitScore(expr) if align is not None: # we do not need add_filter because Alignment already handles it return Alignment(score_expr, align, filter=filter) else: return self.add_filter(ComparisonOp('>', score_expr, 0.5), filter)
def generate_xml_tree(self):
    """
    Try to parse the XML, build a tree from its tags, and cast the result
    into MainAlignment and Alignment objects.
    :return: raises an error when the file does not have the correct content
    """
    try:
        tree = et.parse(self.file)
        self.root = tree.getroot()
        self.blast_output = self.root[8]
        self.iteration = self.blast_output[0]
        self.iteration_hit = self.iteration[4]
        for i in self.iteration_hit:
            self.hits.append(i)
        for i in self.hits:
            h = list(i)
            for hsp in h[5]:
                # identity percentage = identities / alignment length * 100
                procent = "{0:.2f}".format(
                    int(hsp[10].text) / int(hsp[13].text) * 100)
                procent = float(procent)
                self.aligns.append(
                    Alignment(h[2].text, hsp[1].text, procent,
                              hsp[12].text, hsp[10].text, hsp[13].text,
                              hsp[14].text, hsp[15].text, hsp[16].text))
            self.main_alignments.append(
                MainAlignment(i[2].text, self.aligns))
            self.aligns = []
    except IndexError:
        # A bare string literal here was a no-op; actually report the problem.
        print("Bad file.")
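
# Sketch (assumption, not this project's code): the positional lookups above
# (root[8], hsp[10], ...) are brittle across report versions. Assuming
# standard NCBI BLAST XML tag names, ElementTree can address the same values
# by tag instead:
import xml.etree.ElementTree as et

def first_hsp_identity_percent(xml_path):
    """Percent identity of the first Hsp in a BLAST XML report."""
    root = et.parse(xml_path).getroot()
    hsp = root.find('.//Hit_hsps/Hsp')  # first Hsp anywhere in the document
    identity = int(hsp.find('Hsp_identity').text)
    align_len = int(hsp.find('Hsp_align-len').text)
    return float("{0:.2f}".format(100.0 * identity / align_len))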
def make_alignment(self, cfg, alignment): # Make an Alignment from the source, using this subset sub_alignment = SubsetAlignment(alignment, self) sub_path = os.path.join(cfg.phylofiles_path, self.name + '.phy') # Add it into the sub, so we keep it around self.alignment_path = sub_path # Maybe it is there already? if os.path.exists(sub_path): log.debug("Found existing alignment file %s", sub_path) old_align = Alignment() old_align.read(sub_path) # It had better be the same! if not old_align.same_as(sub_alignment): log.error( "It looks like you have changed one or more of the " "data_blocks in the configuration file, " "so the new subset alignments " "don't match the ones stored for this analysis. " "You'll need to run the program with --force-restart") raise SubsetError else: # We need to write it sub_alignment.write(sub_path)
def alignment(self):
    """Make self into an alignment, and return it.

    If all the sequences are the same length and type, then self, a
    sequenceList, could be an Alignment.  This method generates an
    Alignment instance, runs the Alignment method
    checkLengthsAndTypes(), and returns the Alignment.

    If you feed p4 a fasta sequence, it makes a SequenceList object, and
    runs this method on it.  If it works then p4 puts the Alignment
    object in var.alignments, and if not it puts the SequenceList
    object in var.sequenceLists.

    It is possible that p4 might think that some short sequences are
    DNA when they are really protein.  In that case it will fail to
    make an alignment, because it will fail the types check.  So what
    you can do is something like this::

        sl = var.sequenceLists[0]
        for s in sl.sequences:
            s.dataType = 'protein'
        a = sl.alignment()
    """

    from alignment import Alignment
    import copy
    a = Alignment()
    a.fName = self.fName
    a.sequences = copy.deepcopy(self.sequences)  # self will be deleted
    a.checkLengthsAndTypes()
    return a
def permuted_copy(self, partition=None):
    """ Return a copy of the collection with all alignment columns permuted
    """
    def take(n, iterable):
        # next(iterable) works in both Python 2 and 3; iterable.next() is
        # Python 2 only
        return [next(iterable) for _ in range(n)]

    if partition is None:
        partition = Partition([1] * len(self))

    index_tuples = partition.get_membership()

    alignments = []
    for ix in index_tuples:
        concat = Concatenation(self, ix)
        sites = concat.alignment.get_sites()
        random.shuffle(sites)
        d = dict(
            zip(concat.alignment.get_names(),
                [iter(x) for x in zip(*sites)]))
        new_seqs = [[(k, ''.join(take(l, d[k]))) for k in d]
                    for l in concat.lengths]

        for seqs, datatype, name in zip(new_seqs, concat.datatypes,
                                        concat.names):
            alignment = Alignment(seqs, datatype)
            alignment.name = name
            alignments.append(alignment)

    return self.__class__(
        records=sorted(alignments, key=lambda x: SORT_KEY(x.name)))
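
# Minimal sketch (illustrative): the core of permuted_copy is shuffling
# alignment columns while keeping rows paired. With plain (name, sequence)
# tuples of equal length:
import random

def permute_columns(seqs, rng=random):
    """Shuffle the columns of an alignment given as (name, sequence) pairs."""
    names = [name for name, _ in seqs]
    columns = list(zip(*[seq for _, seq in seqs]))  # one tuple per column
    rng.shuffle(columns)
    rows = [''.join(chars) for chars in zip(*columns)]
    return list(zip(names, rows))

# permute_columns([('a', 'ACGT'), ('b', 'A-GT')]) keeps each column intact,
# e.g. [('a', 'GTAC'), ('b', 'GTA-')].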
def main():
    args = args_init(vars(get_args()), align=True)  # save as dictionary

    ## run alignment
    map_bam_list = Alignment(**args).run()
def execute(self):
    # Alignment
    # TODO: choose mode automatically
    msa = Alignment(messages=self.messages,
                    output_dir=self.output_dir,
                    mode=self.mode,
                    multithread=self.multithread)
    msa.execute()

    # Generate fields
    filepath_fields_info = os.path.join(self.output_dir,
                                        Alignment.FILENAME_FIELDS_INFO)
    self.fields, fid_list = self.generate_fields_by_fieldsinfo(
        filepath_fields_info)
    logging.debug("Number of keyword candidates: {}\nfid: {}".format(
        len(fid_list), fid_list))

    # Compute probabilities of observation constraints
    constraint = Constraint(messages=self.messages,
                            direction_list=self.direction_list,
                            fields=self.fields,
                            fid_list=fid_list,
                            output_dir=self.output_dir)
    pairs_p, pairs_size = constraint.compute_observation_probabilities()
    pairs_p_request, pairs_p_response = pairs_p
    pairs_size_request, pairs_size_response = pairs_size
    constraint.save_observation_probabilities(pairs_p_request,
                                              pairs_size_request,
                                              Constraint.TEST_TYPE_REQUEST)
    constraint.save_observation_probabilities(pairs_p_response,
                                              pairs_size_response,
                                              Constraint.TEST_TYPE_RESPONSE)

    # Probabilistic inference
    # NOTE: the merged request+response results are computed here, but the
    # inference below currently runs on the request-side pairs only.
    pairs_p_all, pairs_size_all = self.merge_constraint_results(
        pairs_p_request, pairs_p_response, pairs_size_request,
        pairs_size_response)

    # only test the same fid for both sides
    ffid_list = ["{0}-{0}".format(fid) for fid in fid_list]

    pi = ProbabilisticInference(pairs_p=pairs_p_request,
                                pairs_size=pairs_size_request)
    fid_inferred = pi.execute(ffid_list)

    ## TODO: iterative
    ## TODO: format inference
    return fid_inferred
def get_results(self):
    if self.file_read_job is None:
        return self.results
    else:
        alignment = Alignment()
        alignment.datatype = self.datatype
        alignment.read_filepath(self.alignedfn, file_format='FASTA')
        self.results = alignment
        return self.results
def make_alignment(self, source_alignment_path): # Make the alignment self.alignment = Alignment() self.alignment.read(source_alignment_path) # We start by copying the alignment self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy') if os.path.exists(self.alignment_path): # Make sure it is the same old_align = Alignment() old_align.read(self.alignment_path) if not old_align.same_as(self.alignment): log.error( "Alignment file has changed since previous run. You need to use the force-restart option." ) raise AnalysisError else: self.alignment.write(self.alignment_path)
def __init__(self, gui, parent=None): """ Establish the connection with the main gui, set some instance variables and initialize all flags to False. :param gui: main gui object """ QtCore.QThread.__init__(self, parent) self.gui = gui # Create the alignment object. Alignment points are kept throughout the whole program # execution, even if the telescope driver or other configuration parameters are changed. self.al = Alignment(self.gui.configuration, debug=self.gui.configuration.alignment_debug) self.exiting = False self.output_channel_initialization_flag = False self.telescope_initialization_flag = False self.camera_initialization_flag = False self.new_tesselation_flag = False self.slew_to_alignment_point_flag = False self.perform_alignment_flag = False self.perform_autoalignment_flag = False self.slew_to_moon_limb_flag = False self.set_focus_area_flag = False self.goto_focus_area_flag = False self.slew_to_tile_and_record_flag = False self.move_to_selected_tile_flag = False self.escape_pressed_flag = False # Save the descriptor of standard output. Stdout might be redirected to a file and back # later. self.stdout_saved = sys.stdout # Initialize status variables. self.output_redirected = False self.telescope_connected = False self.camera_connected = False self.tesselation_created = False # Initialize some instance variables. self.active_tile_number = -1 self.all_tiles_recorded = False self.protocol_file = None self.telescope = None self.camera = None self.date_time = None self.me = None self.tc = None self.repeat_from_here = None self.tile_indices_since_last_autoalign = None self.start()
def te_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Map reads to the transposon (TE) index, for control or treatment.

    args: dict, the arguments of the pipeline

    index check order:
    1. rRNA
    2. genome
    3. spike-in rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    te_align_path = project_path['transposon']
    args['extra_index'] = None  # pre-build

    ## qc-report: skipped here, run in gene_aligner

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = te_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = True

    # treat TE as an extra small genome; restore the flag afterwards
    small_genome = args['small_genome']
    args['small_genome'] = True

    ## run alignment
    map_bam_list = Alignment(**args).run()
    map_bam = [item for sublist in map_bam_list for item in sublist]

    args['small_genome'] = small_genome
    return map_bam
def __init__(self, line_string): self.type = self.TYPE_HEADER if line_string.startswith('@') \ else self.TYPE_ALIGNMENT if self.type == self.TYPE_HEADER: self.fields = [line_string] return self.fields = line_string.split() pos, cigar = self.fields[3], self.fields[5] if cigar == '*': raise CigarUnavailableError md = next(filter(lambda field: field.startswith('MD:Z:'), self.fields)) md = md.replace('MD:Z:', '') self.alignment = Alignment(pos, cigar, md)
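
# Usage sketch (hypothetical data; the enclosing class name is not shown in
# the source, `Line` below is assumed). Field layout follows the SAM spec:
# POS is column 4 (fields[3]), CIGAR column 6 (fields[5]), and the MD tag is
# searched among the optional fields.
#
#   header = Line('@SQ\tSN:chr1\tLN:248956422')
#   rec = Line('r1\t0\tchr1\t100\t60\t5M\t*\t0\t0\tACGTA\tIIIII\tMD:Z:5')
#   rec.alignment   # Alignment built from pos='100', cigar='5M', md='5'
#
# A record whose CIGAR is '*' (unmapped) raises CigarUnavailableError instead.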
def gene_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Map reads to the genome, for control or treatment.

    args: dict, the arguments of the pipeline

    index check order:
    1. rRNA
    2. genome
    3. spike-in rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    gene_align_path = project_path['gene']

    ## qc-report
    qc_path = os.path.join(gene_align_path['report'], 'qc')
    # QC_reporter(fq1_files, qc_path).run()

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = gene_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = False

    ## run alignment
    map_bam_list = Alignment(**args).run()

    ## keep only the BAM files mapped to the genome
    map_bam = []
    for i in map_bam_list:
        for k in i:
            if k.endswith('map_' + args['genome'] + '.bam'):
                map_bam.append(k)

    return map_bam
def read_internal_alignment(fn, file_format='FASTA', datatype=None, dirs_to_delete=(), temp_fs=None): alignment = Alignment() alignment.datatype = datatype alignment.read_filepath(fn, file_format=file_format) if len(alignment) >= 1: if dirs_to_delete: assert (temp_fs) for d in dirs_to_delete: time.sleep(.1) #TODO: not sure why this is here! temp_fs.remove_dir(d) return alignment else: raise ValueError( "The alignment file %s has no sequences. PASTA quits." % fn)
def solveAlignment(method, fileName):
    alignment = Alignment(fileName)  # Create the Alignment object
    alignment.readFile()  # Read the file with the input data
    if method == '1':  # Brute force was chosen
        # start = datetime.now()
        result, result1, result2 = alignment.bruteForceSolving()  # Solve
        alignment.printBruteForce(result, result1, result2)  # Print the results
        # print(datetime.now() - start)
    elif method == '2':
        start = datetime.now()
        matrix, moves, result, result1, result2 = alignment.dynamicSolving()  # Solve
        alignment.printDynamic(matrix, moves, result, result1, result2)  # Print the results
        # print(datetime.now() - start)
    else:
        error("Error: check that you are using the correct parameters.\n"
              "Use [-h] for help.")
def simulate(self, partition, outdir, batchsize=1, background=False,
             **kwargs):
    """ Simulate a set of alignments from the parameters inferred on
    a partition
    :param partition: Partition whose inferred parameters drive the simulation
    :param outdir: directory the simulated phylip alignments are written to
    :return:
    """
    # NOTE: `background` was referenced below but never defined; it is
    # assumed here to be an optional keyword argument defaulting to False.
    indices = partition.get_membership()
    self.add_lnl_partitions(partition, **kwargs)
    results = [self.lnl_cache[ix] for ix in indices]
    places = dict((j, i) for (i, j) in enumerate(
        rec.name for rec in self.collection.records))

    # Collect argument list
    args = [None] * len(self.collection)
    for result in results:
        # the local name `part` avoids shadowing the `partition` argument
        for part in result['partitions'].values():
            place = places[part['name']]
            args[place] = (len(self.collection[place]),
                           model_translate(part['model']),
                           part['frequencies'], part['alpha'],
                           result['ml_tree'],
                           part['rates'] if 'rates' in part else None)

    # Distribute work
    msg = 'Simulating'
    client = get_client()
    if client is None:
        map_result = sequential_map(client, tasks.simulate_task, args, msg)
    else:
        map_result = parallel_map(client, tasks.simulate_task, args, msg,
                                  batchsize, background)
    if background:
        return map_result

    # Process results
    for i, result in enumerate(map_result):
        orig = self.collection[i]
        simseqs = gapmask(result, orig.get_sequences())
        al = Alignment(simseqs, 'protein' if orig.is_protein() else 'dna')
        outfile = os.path.join(outdir, orig.name + '.phy')
        al.write_alignment(outfile, 'phylip', True)
def make_alignment(self, cfg, alignment):
    # Make an Alignment from the source, using this subset
    sub_alignment = SubsetAlignment(alignment, self)
    sub_path = os.path.join(cfg.phylofiles_path, self.subset_id + '.phy')
    # Add it into the sub, so we keep it around
    self.alignment_path = sub_path

    # Maybe it is there already?
    if os.path.exists(sub_path):
        log.debug("Found existing alignment file %s", sub_path)
        old_align = Alignment()
        old_align.read(sub_path)

        # It had better be the same!
        if not old_align.same_as(sub_alignment):
            log.error(self.FORCE_RESTART_MESSAGE)
            raise SubsetError
    else:
        # We need to write it
        sub_alignment.write(sub_path)
def extra_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Map reads to an extra index, for control or treatment.

    args: dict, the arguments of the pipeline

    index check order:
    1. rRNA
    2. genome
    3. spike-in rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    extra_align_path = project_path['extra']

    ## qc-report
    qc_path = os.path.join(extra_align_path['report'], 'qc')
    # QC_reporter(fq1_files, qc_path).run()

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = extra_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = False

    # treat the extra index as a small genome (for STAR); restore afterwards
    small_genome = args['small_genome']
    args['small_genome'] = True

    ## run alignment
    map_bam = Alignment(**args).run()

    args['small_genome'] = small_genome
    return map_bam
def pre_process(optmap_i, optmap_file, myfile, myfile2, output_dir, min_confidence): header_lines = 10 header = [] minrefoverhang = 50000 minqryoverhang = 50000 all_alms = { } # stores all the Alignments for all groups, all_groups[ref] should contain molecule ref qualify_alms = { } # only keep one alignment(the one with highest confidence) for each contig in one molecule removed = { } # removed[ref,qry] == True means alignment for (ref, qry) is already removed # collecting alignments and store in all_groups print '---------------read .xmap file-------------------' with open(myfile + '_flip.xmap', 'rb') as csvfile: csvreader = csv.reader(csvfile, delimiter='\t') for i in range(header_lines): # 10 lines of header header.append(csvreader.next()) # save them # read the first non-header line while True: try: row = csvreader.next() x = Alignment(int(row[1]), int(row[2]), float(row[3]), float(row[4]), float(row[5]), float(row[6]), row[7], float(row[8]), row[9], float(row[10]), float(row[11]), int(row[12]), row[13]) if x.ref not in all_alms: all_alms[x.ref] = [x] else: all_alms[x.ref].append(x) except StopIteration: break num_all_alms = 0 for ref in all_alms: num_all_alms += len(all_alms[ref]) print "In total, the number of alignments collected is ", num_all_alms # only keep one alignment(the one with highest confidence) for each contig in one molecule for ref in all_alms: group = all_alms[ref] qry_bestx = {} for x in group: if x.qry not in qry_bestx: qry_bestx[x.qry] = x else: if x.confidence > qry_bestx[x.qry].confidence: qry_bestx[x.qry] = x qualify_alms[ref] = {} for qry in qry_bestx: qualify_alms[ref][qry] = qry_bestx[qry] num_qualify_alms = 0 for ref in qualify_alms: num_qualify_alms += len(qualify_alms[ref]) # initialize removed array for ref in qualify_alms: for qry in qualify_alms[ref]: removed[ref, qry] = False current_alms = copy_alms(qualify_alms, removed) output_alms(current_alms, output_dir + "/opt_" + str(optmap_i) + "_alms_0_initial.log") print "In total, the number of alignments in qualify_alms is ", num_qualify_alms # remove low confidence alignments print '---------------Remove low quality alignments---------------' for ref in qualify_alms: for qry in qualify_alms[ref]: x = qualify_alms[ref][qry] if x.confidence < min_confidence: removed[ref, qry] = True print 'alignment (', ref, ',', qry, ') is low quality and removed' num_alms = 0 for ref in qualify_alms: for qry in qualify_alms[ref]: if removed[ref, qry] == False: num_alms += 1 current_alms = copy_alms(qualify_alms, removed) output_alms( current_alms, output_dir + "/opt_" + str(optmap_i) + "_alms_1_removed_low_conf.log") print "After removing low confidence alignments, the number of alignments is ", num_alms print '---------------End---------------' # read optical map optmap = {} with open(optmap_file) as f_map: for line in f_map: line = line.strip() if line[0] == '#': continue cols = line.split('\t') CMapId = int(cols[0]) LabelChannel = cols[4] Position = float(cols[5]) if CMapId not in optmap: optmap[CMapId] = [] if LabelChannel == "1": optmap[CMapId].append(Position) for CMapId in optmap: optmap[CMapId].sort() print '---------------scaling-------------------' # calculating scaling qry_len = {} with open(myfile2 + '_key.txt') as f_key: for i in range(0, 4): # 4 header lines f_key.readline() for line in f_key: line = line.strip() cols = line.split('\t') qry_id = int(cols[0]) seq_len = int(cols[2]) qry_len[qry_id] = seq_len scaling = 0 num = 0 with open(myfile + '_r.cmap') as f_q: for i in range(0, 11): # 11 header lines 
            f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            seq_len = qry_len[qry_id]
            scaling += appr_len / seq_len
            num += 1
    scaling /= num
    # scaling=1.02258059775
    # NOTE: the computed scaling is overridden here and effectively disabled
    scaling = 1.0

    # use scaling to adjust coordinates of alignments
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            x.qrystartpos /= scaling
            x.qryendpos /= scaling
            x.qrylen /= scaling
            x.refstartpos /= scaling
            x.refendpos /= scaling
            x.reflen /= scaling

    # use scaling to adjust coordinates of optical map
    for ref in optmap:
        for i in range(0, len(optmap[ref])):
            optmap[ref][i] /= scaling
    print '---------------END-------------------'

    # find the reference-based coordinates for each contig
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if (x.orientation == '+'):
                x.qry_left_overlen = x.qrystartpos
                x.qry_right_overlen = x.qrylen - x.qryendpos
            else:
                x.qry_left_overlen = x.qrylen - x.qrystartpos
                x.qry_right_overlen = x.qryendpos
            x.start = x.refstartpos - x.qry_left_overlen
            x.end = x.refendpos + x.qry_right_overlen

            x.ref_left_overlen = x.refstartpos
            x.ref_right_overlen = x.reflen - x.refendpos
            if (x.orientation == '+'):
                x.refstart = x.qrystartpos - x.ref_left_overlen
                x.refend = x.qryendpos + x.ref_right_overlen
            else:
                x.refstart = x.qryendpos - x.ref_right_overlen
                x.refend = x.qrystartpos + x.ref_left_overlen

    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/opt_" + str(optmap_i) + "_alms_2_scaled.log")
    print "After scaling, the number of alignments is ", num_alms

    # read qry map
    qry_markers = {}
    with open(myfile + '_r.cmap') as f_q:
        for i in range(11):  # 11 header lines
            header_line = f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            CMapId = int(cols[0])
            ContigLength = float(cols[1])
            NumSites = int(cols[2])
            SiteID = int(cols[3])
            LabelChannel = cols[4]
            Position = float(cols[5])
            if LabelChannel == "0":
                continue
            if CMapId not in qry_markers:
                qry_markers[CMapId] = []
            Position /= scaling
            qry_markers[CMapId].append(Position)
    for CMapId in qry_markers:
        qry_markers[CMapId].sort()
    # the with-block already closed f_q; no explicit close() needed

    print '---------------candidate cutting sites-------------------'
    fpair = open(output_dir + "/chimeric_pairs_" + str(optmap_i) + ".log",
                 'w')
    fpair.write("ref_id\tref_pos\tqry_id\tqry_pos\n")
    chimeric_pairs = []
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == True:
                continue
            x = qualify_alms[ref][qry]
            if (x.confidence > min_confidence):
                ref_left_overlen = x.refstartpos
                ref_right_overlen = x.reflen - x.refendpos
                flag_left = False
                flag_right = False
                if (x.qry_left_overlen > minqryoverhang
                        and ref_left_overlen > minrefoverhang
                        and markers_in_qry_left_overhang(qry_markers, x) > 0):
                    flag_left = True
                    chimeric_pairs.append(
                        (x.ref, x.refstartpos, x.qry, x.qrystartpos))
                    print(x.ref, x.refstartpos, x.qry,
                          x.qrystartpos), "is a pair of candidate cutting sites"
                    fpair.write(
                        str(x.ref) + "\t" + str(x.refstartpos) + "\t" +
                        str(x.qry) + "\t" + str(x.qrystartpos) + "\n")
                if (x.qry_right_overlen > minqryoverhang
                        and ref_right_overlen > minrefoverhang
                        and markers_in_qry_right_overhang(qry_markers, x) > 0):
                    flag_right = True
                    chimeric_pairs.append(
                        (x.ref, x.refendpos, x.qry, x.qryendpos))
                    print(x.ref, x.refendpos, x.qry,
                          x.qryendpos), "is a pair of candidate cutting sites"
                    fpair.write(
                        str(x.ref) + "\t" + str(x.refendpos) + "\t" +
                        str(x.qry) + "\t"
+ str(x.qryendpos) + "\n") if flag_left == True and flag_right == True: removed[ref, qry] = True fpair.close() print '---------------END-------------------' num_alms = 0 for ref in qualify_alms: for qry in qualify_alms[ref]: if removed[ref, qry] == False: num_alms += 1 current_alms = copy_alms(qualify_alms, removed) output_alms( current_alms, output_dir + "/opt_" + str(optmap_i) + "_alms_3_removed_both_overhang.log") print "After removing alignments with both overhangs, the number of alignments is ", num_alms # check overlap between alignments for r in qualify_alms: for q1 in qualify_alms[r]: if removed[r, q1] == True: continue x = qualify_alms[r][q1] for q2 in qualify_alms[r]: if removed[r, q2] == True: continue y = qualify_alms[r][q2] if q1 >= q2: continue if x.refstartpos <= y.refstartpos and y.refstartpos <= x.refendpos: overlap = min(x.refendpos, y.refendpos) - y.refstartpos elif y.refstartpos <= x.refstartpos and x.refstartpos <= y.refendpos: overlap = min(x.refendpos, y.refendpos) - x.refstartpos else: overlap = 0 if overlap >= 20000: if x.confidence < y.confidence: removed[r, q1] = True else: removed[r, q2] = True num_alms = 0 for ref in qualify_alms: for qry in qualify_alms[ref]: if removed[ref, qry] == False: num_alms += 1 current_alms = copy_alms(qualify_alms, removed) output_alms( current_alms, output_dir + "/opt_" + str(optmap_i) + "_alms_4_solved_overlaps.log") print "After removing one of two overlap alignments, the number of alignments is ", num_alms return current_alms, optmap, chimeric_pairs
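
# Sketch (illustrative): both pre_process above and mtp below start by
# keeping only the highest-confidence alignment per (ref, qry) pair. The
# same reduction, assuming plain objects with ref/qry/confidence attributes:
def best_alignment_per_pair(alignments):
    """Map (ref, qry) -> the single highest-confidence alignment."""
    best = {}
    for aln in alignments:
        key = (aln.ref, aln.qry)
        if key not in best or aln.confidence > best[key].confidence:
            best[key] = aln
    return best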
def mtp(myfile, myfile2, output_dir, GLPSOL, false_alm_threshold, min_confidence): # discard alignments below min_confidence #min_confidence = 25 header_lines = 10 header = [] # alignment overhangs above this number of bps are considered chimeric #minrefoverhang = 100000 #minqryoverhang = 100000 all_alms = { } # stores all the Alignments for all groups, all_groups[ref] should contain molecule ref qualify_alms = { } # only keep one alignment(the one with highest confidence) for each contig in one molecule removed = { } # removed[ref,qry] == True means alignment for (ref, qry) is already removed # collecting alignments and store in all_groups print '---------------read .xmap file-------------------' with open(myfile + '.xmap', 'rb') as csvfile: csvreader = csv.reader(csvfile, delimiter='\t') for i in range(header_lines): # 10 lines of header header.append(csvreader.next()) # save them # read the first non-header line while True: try: row = csvreader.next() x = Alignment(int(row[1]), int(row[2]), float(row[3]), float(row[4]), float(row[5]), float(row[6]), row[7], float(row[8]), row[9], float(row[10]), float(row[11]), int(row[12]), row[13]) if x.ref not in all_alms: all_alms[x.ref] = [x] else: all_alms[x.ref].append(x) except StopIteration: break num_all_alms = 0 for ref in all_alms: #print 'collected', len(all_alms[ref]), 'alignments for molecule', ref num_all_alms += len(all_alms[ref]) print "In total, the number of alignments collected is ", num_all_alms # only keep one alignment(the one with highest confidence) for each contig in one molecule for ref in all_alms: group = all_alms[ref] qry_bestx = {} for x in group: if x.qry not in qry_bestx: qry_bestx[x.qry] = x else: if x.confidence > qry_bestx[x.qry].confidence: qry_bestx[x.qry] = x qualify_alms[ref] = {} for qry in qry_bestx: qualify_alms[ref][qry] = qry_bestx[qry] num_qualify_alms = 0 for ref in qualify_alms: num_qualify_alms += len(qualify_alms[ref]) print "In total, the number of alignments in qualify_alms is ", num_qualify_alms # initialize removed array for ref in qualify_alms: for qry in qualify_alms[ref]: removed[ref, qry] = False # find the reference-based coordinates for each alignments for ref in qualify_alms: for qry in qualify_alms[ref]: x = qualify_alms[ref][qry] if (x.orientation == '+'): x.qry_left_overlen = x.qrystartpos x.qry_right_overlen = x.qrylen - x.qryendpos else: x.qry_left_overlen = x.qrylen - x.qrystartpos x.qry_right_overlen = x.qryendpos x.start = x.refstartpos - x.qry_left_overlen x.end = x.refendpos + x.qry_right_overlen current_alms = copy_alms(qualify_alms, removed) output_alms(current_alms, output_dir + "/alms_0_initial.log") print "Initially, the number of alignments is", count_alms(current_alms) alms_0 = copy_alms(qualify_alms, removed) aligned_contigs = different_contigs(alms_0, {}) output_contigs(aligned_contigs, myfile + '_aligned.txt') print '---------------END-------------------' # remove low confidence alignments print '---------------Remove low quality alignments---------------' for ref in qualify_alms: for qry in qualify_alms[ref]: x = qualify_alms[ref][qry] if x.confidence < min_confidence: removed[ref, qry] = True print 'alignment (', ref, ',', qry, ') is low quality and removed' num_alms = 0 for ref in qualify_alms: for qry in qualify_alms[ref]: if removed[ref, qry] == False: num_alms += 1 current_alms = copy_alms(qualify_alms, removed) output_alms(current_alms, output_dir + "/alms_1_removed_lowconf.log") print "After removing low confidence alignments, the number of alignments is", 
    print count_alms(current_alms)
    alms_1 = copy_alms(qualify_alms, removed)
    lowconf_contigs = different_contigs(alms_0, alms_1)
    output_contigs(lowconf_contigs, myfile + '_lowconf.txt')
    print '---------------End---------------'

    print '---------------removing false positive alignments-------------------'
    current_alms = copy_alms(qualify_alms, removed)
    false_alms(GLPSOL, false_alm_threshold, current_alms, removed, output_dir)
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms, output_dir + "/alms_2_removed_false_alms.log")
    print "After removing false positive alignments, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    print '---------------removing contained contigs locally-------------------'
    for ref in qualify_alms:
        for q1 in qualify_alms[ref]:
            x = qualify_alms[ref][q1]
            for q2 in qualify_alms[ref]:
                if q2 <= q1:
                    continue
                y = qualify_alms[ref][q2]
                if (x.start >= y.start) and (x.end <= y.end):
                    removed[ref, q1] = True
                    print [ref, q1], "alignment is removed because it's contained in alignment", [ref, q2]
                elif (y.start >= x.start) and (y.end <= x.end):
                    removed[ref, q2] = True
                    print [ref, q2], "alignment is removed because it's contained in alignment", [ref, q1]
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/alms_3_removed_contained_locally.log")
    print "After removing contained alignments locally, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    # build the mst
    print '---------------building the mst-------------------'
    fo = open(output_dir + "/ugraph_1.log", 'w')
    current_alms = copy_alms(qualify_alms, removed)
    forest, vertex_orientations = get_mst(current_alms, fo)
    fo.close()
    output_forest(forest, vertex_orientations, output_dir + "/forest_1.log")
    print '---------------END-------------------'

    # unify the coordinates
    print '---------------unifying the coordinates-------------------'
    current_alms = copy_alms(qualify_alms, removed)
    unify_alms = unify_coords(output_dir, current_alms, forest,
                              vertex_orientations)
    removed_unify = {}
    for root in unify_alms:
        for qry in unify_alms[root]:
            removed_unify[root, qry] = False
    contigs = set([])
    for root in unify_alms:
        for qry in unify_alms[root]:
            if qry in contigs:
                print qry, "appears in more than one tree"
            contigs.add(qry)
    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms, output_dir + "/alms_4_unified.log")
    print "After unifying the coordinates, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    print '---------------removing contained contigs globally-------------------'
    contained = set([])
    for root in unify_alms:
        for q1 in unify_alms[root]:
            x = unify_alms[root][q1]
            for q2 in unify_alms[root]:
                if q2 <= q1:
                    continue
                y = unify_alms[root][q2]
                if (q2 not in contained) and (x.start >= y.start) and (x.end <= y.end):
                    contained.add(q1)
                    removed_unify[root, q1] = True
                    print [root, q1], "alignment is removed because it's contained in alignment", [root, q2]
                elif (q1 not in contained) and (y.start >= x.start) and (y.end <= x.end):
                    contained.add(q2)
                    removed_unify[root, q2] = True
                    print [root, q2], "alignment is removed because it's contained in alignment", [root, q1]
    for root in unify_alms:
        for qry in unify_alms[root]:
            if qry in contained and removed_unify[root, qry] == False:
                removed_unify[root, qry] = True
                print [root, qry], "alignment is removed because qry is a contained contig"
    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms,
                output_dir + "/alms_5_removed_contained_globally.log")
    print "After removing contained contigs globally, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    # build new mst
    print '---------------building new mst-------------------'
    fo = open(output_dir + "/ugraph_2.log", 'w')
    current_alms = copy_alms(unify_alms, removed_unify)
    forest_unify, vertex_orientations_unify = get_mst(current_alms, fo)
    fo.close()
    output_forest(forest_unify, vertex_orientations_unify,
                  output_dir + "/forest_2.log")
    print '---------------END-------------------'

    print '---------------merging DAGs-------------------'
    current_alms = copy_alms(unify_alms, removed_unify)
    DAGs = merge_DAGs(current_alms, forest_unify, vertex_orientations_unify)
    output_DAGs(DAGs, output_dir + "/dags.log")
    print '---------------END-------------------'

    # DAG to mtp contig set
    print '---------------mtp-------------------'
    mtp_node_set = get_subDAGs(DAGs, output_dir)
    current_alms = copy_alms(unify_alms, removed_unify)
    mtp = []
    for ref in current_alms:
        for qry in current_alms[ref]:
            x = current_alms[ref][qry]
            if qry in mtp_node_set:
                mtp.append(x)
            else:
                removed_unify[ref, qry] = True
    mtp.sort(key=lambda x: (x.ref, x.start))
    print "In total, the number of alignments in mtp is", len(mtp)
    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms, output_dir + "/alms_6_mtp.log")
    print '---------------END-------------------'

    print '---------------scaling-------------------'
    # calculating scaling
    qry_len = {}
    with open(myfile2 + '_key.txt') as f_key:
        for i in range(0, 4):  # 4 header lines
            f_key.readline()
        for line in f_key:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            seq_len = int(cols[2])
            qry_len[qry_id] = seq_len
    scaling = 0
    num = 0
    with open(myfile + '_q.cmap') as f_q:
        for i in range(0, 11):  # 11 header lines
            f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            seq_len = qry_len[qry_id]
            scaling += appr_len / seq_len
            num += 1
    scaling /= num
    # scaling=1.02258059775

    print '---------------outputting-------------------'
    # save the MTP in a new xmap file and count the number of unitigs in
    # each assembly
    with open(myfile + '_list.txt', 'wb') as listfile:
        with open(myfile + '_mtp.xmap', 'wb') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t')
            # copies the old xmap header
            for x in header:
                csvwriter.writerow(x)
            i = 1  # progressive number
            # for steve -> # scaling = 1.02257561752017878915 # scaling fact from opt map to BP
            previous = 0  # previous qry contig, to remove dups
            for x in mtp:
                # save the contig in listfile only if it is a new one
                if (x.qry != previous):
                    #listfile.write(str(x.qry)+'\n')
                    previous = x.qry
                    # for steve ->
                    listfile.write(
                        str(x.ref) + '\t' + str(x.qry) + '\t' +
                        str(int(round(float(x.start) / scaling))) + '\t' +
                        str(int(round(float(x.end) / scaling))) + '\t' +
                        x.orientation + '\n')
                # dump the alignment
                csvwriter.writerow([i] + x.unpack())
                i += 1
    # the with-blocks close both files; explicit close() calls are redundant
def load_args(self):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-a",
                        "--alignment",
                        required=True,
                        help="\nEnter alignment file.\n\n")
    parser.add_argument("-s",
                        "--symbol",
                        required=True,
                        choices=["protein", "nucleotide"],
                        help="\nEnter type of alignment being inputted."
                        "\n\n")
    parser.add_argument("-m",
                        "--matrix",
                        required=True,
                        help="\nSelect similarity matrix.\n\n"
                        "| PRESETS: blosum62, blosum90, blosum100,\n"
                        "pam100, pam250, binary.\n\n"
                        "| NOTE: If a preset is not chosen the program\n"
                        "will assume you are loading a file. Make sure\n"
                        "the file you load follows the standard format\n"
                        "set by PAM and BLOSUM.\n\n"
                        "| NOTE: Binary matrices ignore any\n"
                        "similarities among disparate symbols.\n\n"
                        "| NOTE: Nucleotide alignments that are not\n"
                        "converted must use a binary matrix."
                        "\n\n")
    parser.add_argument("-d",
                        "--distribution",
                        required=True,
                        help="\nSelect sequences to define the\n"
                        "background distribution.\n\n"
                        "| PRESETS: self, swiss-prot\n\n"
                        "| NOTE: If a preset is not chosen the program\n"
                        "will assume you are loading a file. Make sure\n"
                        "any file you load follows fasta formatting\n"
                        "and is of the same symbol type (protein or\n"
                        "nucleotide).\n\n"
                        "| NOTE: If self is chosen the sequences\n"
                        "of the inputted alignment file will be used\n"
                        "for the calculation.\n\n")
    parser.add_argument("-c",
                        "--convert",
                        required=True,
                        choices=["yes", "no"],
                        help="\nConvert nucleotide alignment to protein\n"
                        "alignment?\n\n"
                        "| NOTE: Sequences in alignment must be of equal\n"
                        "length to convert.\n\n"
                        "| NOTE: First open reading frame is used.")
    args = parser.parse_args()
    matrix_input = args.matrix
    if args.symbol == "nucleotide" and args.convert == "no":
        if args.distribution == "swiss-prot":
            exit("Nucleotide alignment cannot use amino acid background "
                 "distribution")
        if matrix_input != "binary":
            exit("Nucleotide alignments that are not converted must use a "
                 "binary matrix")
        matrix_input = "binary-nucleotide"
    else:
        if args.symbol == "protein" and args.convert == "yes":
            exit("Cannot convert protein alignment to nucleotide alignment")
        if matrix_input == "binary":
            matrix_input = "binary-protein"
    sim_matrix = Matrix(matrix_input)
    return Alignment(args.alignment, args.symbol, args.distribution,
                     sim_matrix, args.convert)
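
# Example invocation (hypothetical script and file names), matching the
# options defined above -- a protein alignment scored with the blosum62
# preset against its own background distribution, with no conversion:
#
#   python score_alignment.py -a my_alignment.fasta -s protein \
#       -m blosum62 -d self -c no
#
# Nucleotide alignments with -c no must pair with -m binary, which
# load_args silently rewrites to the internal "binary-nucleotide" matrix.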
def calculate_chunk(h, ref, beam_size): a = Alignment(h, ref) line1UsedWords = [False for _ in range(len(a.line1))] line2UsedWords = [False for _ in range(len(a.line2))] initialPath = PartialAlignment([None for _ in range(len(a.line2))], line1UsedWords, line2UsedWords) # init all possible matches for i in range(len(a.line1)): for j in range(len(a.line2)): if is_similar_word(a.line1[i], a.line2[j]): a.matches[j].append( Match(start=j, length=1, matchStart=i, matchLength=1, prob=1)) a.line1Coverage[i] += 1 a.line2Coverage[j] += 1 # One-to-one, non-overlapping matches are definite for i in range(len(a.matches)): if len(a.matches[i]) == 1: m = a.matches[i][0] overlap = False if (a.line2Coverage[i] != 1): overlap = True if (a.line1Coverage[m.matchStart] != 1): overlap = True if not overlap: initialPath.matches[i] = m initialPath.line2UsedWords[i] = True initialPath.line1UsedWords[m.matchStart] = True # Resolve best alignment using remaining matches paths = [] nextPaths = [initialPath] for current in range(len(a.matches) + 1): paths = nextPaths nextPaths = [] paths.sort(key=functools.cmp_to_key(compare)) # print(paths) # Try as many paths as beam allows numRank = min(beam_size, len(paths)) for rank in range(numRank): path = paths[rank] # Case: path is complete if current == len(a.matches): # Close last chunk if path.lastMatchEnd != -1: path.chunks += 1 nextPaths.append(path) continue # Case: Current index word is in use if path.line2UsedWords[current] is True: # If fixed match if path.matches[path.idx] != None: m = path.matches[path.idx] path.matchCount += 1 path.matches1 += 1 path.matches2 += 1 # Not continuous in line1 if path.lastMatchEnd != -1 and m.matchStart != path.lastMatchEnd: path.chunks += 1 # Advance to end of match + 1 path.idx = m.start + 1 path.lastMatchEnd = m.matchStart + 1 path.distance += abs(m.start - m.matchStart) nextPaths.append(path) continue # Case: Multiple possible matches, for each match starting at index start matches = a.matches[current] for i in range(len(matches)): m = matches[i] # Check to see if words are unused if path.isUsed(m): continue newPath = copy.deepcopy(path) # Select m for this start index newPath.setUsed(m, True) newPath.matches[current] = m # Calculate new stats newPath.matchCount += 1 newPath.matches1 += 1 newPath.matches2 += 1 # Not continuous in line1 if newPath.lastMatchEnd != -1 and m.matchStart != newPath.lastMatchEnd: newPath.chunks += 1 # Advance to end of match + 1 newPath.idx = m.start + 1 newPath.lastMatchEnd = m.matchStart + 1 newPath.distance += abs(m.start - m.matchStart) nextPaths.append(newPath) # Try skipping this index if path.lastMatchEnd != -1: path.chunks += 1 path.lastMatchEnd = -1 path.idx += 1 nextPaths.append(path) if len(nextPaths) == 0: print( "Warning: unexpected conditions - skipping matches until possible to continue" ) nextPaths.append(paths[0]) # Return top best path's chunk number nextPaths.sort(key=functools.cmp_to_key(compare)) return nextPaths[0].chunks, nextPaths[0].matchCount
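
# Minimal sketch (assumption: this mirrors the METEOR-style chunk count the
# beam search above computes). Given matched word positions as (hyp_index,
# ref_index) pairs sorted by ref_index, a chunk is a maximal run that is
# contiguous on both sides:
def count_chunks(matches):
    """Count contiguous match chunks from sorted (hyp_i, ref_i) pairs."""
    chunks = 0
    prev = None
    for hyp_i, ref_i in matches:
        if prev is None or hyp_i != prev[0] + 1 or ref_i != prev[1] + 1:
            chunks += 1  # a break on either side starts a new chunk
        prev = (hyp_i, ref_i)
    return chunks

# count_chunks([(0, 0), (1, 1), (3, 2)]) -> 2  (break on the hypothesis side)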
import os

from google.appengine.ext import ndb

import jinja2
import webapp2

# NOTE: the Alignment import was missing; it is assumed to come from the
# local module, as in the other examples in this collection.
from alignment import Alignment

JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)
# [END imports]

DEFAULT_GUESTBOOK_NAME = 'default_guestbook'
aclass = Alignment()


# [START main_page]
class MainPage(webapp2.RequestHandler):
    def get(self):
        sequence_1 = self.request.get('sequence_1')
        sequence_2 = self.request.get('sequence_2')
        match = self.request.get('match')
        mismatch = self.request.get('mismatch')
        gap = self.request.get('gap')
        alignment = self.request.get('alignment')
        print sequence_1, sequence_2, match, mismatch, gap, alignment
        if 'global' in alignment.lower():
def traceback_process(
        scoring_matrix: List[List[Cell]], seq1: str, seq2: str,
        starting_cell: Cell) -> Alignment:
    """This method computes the traceback process for retrieving an alignment

    Args:
        scoring_matrix (List[List[Cell]]): the scoring matrix
        seq1 (str): the first sequence
        seq2 (str): the second sequence
        starting_cell (Cell): the starting cell for the traceback process

    Returns:
        The alignment starting from starting_cell
    """
    subseq1, subseq2 = "", ""
    max_gap_length = 0
    min_gap_length = max(len(seq1), len(seq2))
    n_gaps = 0
    tmp_gap = None
    gap_direction = None
    actual_cell = starting_cell
    while actual_cell.score > 0:
        i, j = actual_cell.indices
        seq_i = seq1[i - 1]
        seq_j = seq2[j - 1]
        if actual_cell.origin == Move.DIAGONAL:
            if tmp_gap is not None:
                # If there was a gap, end it and update the counts
                max_gap_length = max(max_gap_length, tmp_gap)
                min_gap_length = min(min_gap_length, tmp_gap)
                n_gaps += 1
                tmp_gap = None
                gap_direction = None
            subseq1 += seq_i
            subseq2 += seq_j
            actual_cell = scoring_matrix[i - 1][j - 1]
        elif actual_cell.origin == Move.HORIZONTAL:
            if gap_direction == Move.HORIZONTAL:
                tmp_gap += 1
            else:
                if tmp_gap is not None:
                    max_gap_length = max(max_gap_length, tmp_gap)
                    min_gap_length = min(min_gap_length, tmp_gap)
                    n_gaps += 1
                tmp_gap = 1
                gap_direction = Move.HORIZONTAL
            subseq1 += "-"
            subseq2 += seq_j
            actual_cell = scoring_matrix[i][j - 1]
        elif actual_cell.origin == Move.VERTICAL:
            if gap_direction == Move.VERTICAL:
                tmp_gap += 1
            else:
                if tmp_gap is not None:
                    max_gap_length = max(max_gap_length, tmp_gap)
                    min_gap_length = min(min_gap_length, tmp_gap)
                    n_gaps += 1
                tmp_gap = 1
                gap_direction = Move.VERTICAL
            subseq1 += seq_i
            subseq2 += "-"
            actual_cell = scoring_matrix[i - 1][j]
        else:
            # str() is needed here: joining enum members directly would
            # raise a TypeError, since str.join expects strings
            raise Exception(
                f'Something went wrong: origin must be one of '
                f'{", ".join(str(move) for move in Move)}')
    if tmp_gap is not None:
        max_gap_length = max(max_gap_length, tmp_gap)
        min_gap_length = min(min_gap_length, tmp_gap)
        n_gaps += 1
    return Alignment(
        subseq1[::-1],
        subseq2[::-1],
        max_gap_length,
        min_gap_length,
        n_gaps,
        starting_cell.score,
        starting_cell.indices,
    )
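
# Sketch (illustrative): the gap bookkeeping above, restated for two already
# aligned strings with '-' gap characters. Gap runs in either row are
# counted and their min/max lengths tracked, just as the traceback does:
def gap_stats(aligned1, aligned2):
    """Return (n_gaps, min_gap_length, max_gap_length) over both rows."""
    n_gaps, min_len, max_len = 0, None, 0
    for row in (aligned1, aligned2):
        run = 0
        for ch in row + 'X':  # sentinel character flushes a trailing run
            if ch == '-':
                run += 1
            elif run:
                n_gaps += 1
                max_len = max(max_len, run)
                min_len = run if min_len is None else min(min_len, run)
                run = 0
    return n_gaps, min_len, max_len

# gap_stats('AC--GT', 'ACGTG-')  ->  (2, 1, 2)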
import cv2 import sys import numpy as np import datetime sys.path.append('../SSH') sys.path.append('../alignment') from ssh_detector import SSHDetector from alignment import Alignment from embedding import Embedding #short_max = 800 scales = [1200, 1600] t = 2 detector = SSHDetector('../SSH/model/e2ef', 0) alignment = Alignment('../alignment/model/3d_I5', 12) embedding = Embedding('./model/model', 0) out_filename = './out.png' f = '../sample-images/t1.jpg' if len(sys.argv) > 1: f = sys.argv[1] img = cv2.imread(f) im_shape = img.shape print(im_shape) target_size = scales[0] max_size = scales[1] im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) if im_size_min > target_size or im_size_max > max_size: im_scale = float(target_size) / float(im_size_min)
import sys
from alignment import Alignment

file = sys.argv[1]
#file = 'test_files/test.txt'
args = open(file).readlines()
flag = args[0].rstrip()
scores = args[1].split()
match = int(scores[0])
mismatch = int(scores[1])
indel = int(scores[2])
seq1 = args[2].rstrip()
seq2 = args[3].rstrip()

if flag == 'g':
    a = Alignment(match, mismatch, indel, seq1, seq2)
    a.single_global_single_align()
    a.report_optimal_score()
elif flag == 'l':
    a = Alignment(match, mismatch, indel, seq1, seq2)
    a.local_single_align()
    a.report_optimal_score()
else:
    # exit here; otherwise the writes below would use an undefined `a`
    sys.exit("Invalid alignment flag.")

results = open('results.txt', "w")
results.write("Score:")
results.write(str(a.get_optimal_score()))
results.write("\n")
results.write("Number of Optimal Alignments:")
results.write(str(a.get_total_optimal_alignments()))
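
# Example input file (hypothetical contents) matching the parsing above:
# line 1 = flag ('g' for global, 'l' for local), line 2 = match, mismatch
# and indel scores, lines 3-4 = the two sequences:
#
#   g
#   1 -1 -2
#   ACGTACGT
#   ACGTCGT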
def chipseq_genome(): args = get_args() if args.o is None: args.o = str(pathlib.Path.cwd()) # prep-dirs # subdirs = ['genome_mapping', 'count', 'bigWig', 'report', 'src'] prj_path = prepare_project(args.o) # path_out # |-genome_mapping # |-bigWig # |-macs2_output # |-transposon_analysis # |-src ## Alignment ## # control ctl_fqs = [f.name for f in args.c] if args.C is None: ctl_prefix = str_common([os.path.basename(f) for f in ctl_fqs]) ctl_prefix = ctl_prefix.rstrip('r|R|rep|Rep').rstrip('_|.') args.C = ctl_prefix ctl_path = os.path.join(prj_path['genome_mapping'], args.C) ctl_bam_files, ext_ctl_bam_files = Alignment( ctl_fqs, ctl_path, smp_name=args.C, genome=args.g, spikein=None, index_ext=args.x, threads=args.threads, unique_only=True, n_map=1, aligner=args.aligner, align_to_rRNA=True, merge_rep=False, path_data=args.path_data, overwrite=args.overwrite).run() # treatment tre_fqs = [f.name for f in args.t] if args.T is None: tre_prefix = str_common([os.path.basename(f) for f in tre_fqs]) tre_prefix = tre_prefix.rstrip('r|R|rep|Rep').rstrip('_|.') args.T = tre_prefix tre_path = os.path.join(prj_path['genome_mapping'], args.T) tre_bam_files, ext_tre_bam_files = Alignment( tre_fqs, tre_path, smp_name=args.T, genome=args.g, spikein=None, index_ext=args.x, threads=args.threads, unique_only=True, n_map=1, aligner=args.aligner, align_to_rRNA=True, merge_rep=False, path_data=args.path_data, overwrite=args.overwrite).run() # ## mapping stat ## # map_stat_path = prj_path['report'] # map_stat_file = os.path.join(map_stat_path, 'mapping.stat') # ctl_map = map_stat(ctl_path) # tre_map = map_stat(tre_path) # df_map = pd.concat([ctl_map, tre_map], axis=0).reset_index() # df_map = df_map.sort_values(['index']) # df_map.to_csv(map_stat_file, sep='\t', header=True, index=False) ################ ## call peaks ## ################ ## for each replicates for tre_bam in tre_bam_files: i = tre_bam_files.index(tre_bam) if i >= len(ctl_bam_files): i = 0 # the first one ctl_bam = ctl_bam_files[i] # output directory tre_bam_prefix = file_prefix(tre_bam)[0] tre_bam_path = os.path.join(prj_path['macs2_output'], tre_bam_prefix) p = Macs2(ip=tre_bam, control=ctl_bam, genome=args.g, output=tre_bam_path, prefix=tre_bam_prefix) # call peaks p.callpeak() p.bdgcmp(opt='ppois') p.bdgcmp(opt='FE') p.bdgcmp(opt='logLR') # annotate peaks p.broadpeak_annotation() ################### ## create bigWig ## ################### # map_bam_files = ctl_bam_files + tre_bam_files # bw_path = prj_path['bigWig'] # for bam in map_bam_files: # bam2bigwig( # bam=bam, # genome=args.g, # path_out=bw_path, # strandness=args.s, # binsize=args.bin_size, # overwrite=args.overwrite) ######################### ## transposon analysis ## ######################### if ext_tre_bam_files is None or ext_ctl_bam_files is None: logging.info('transposon analysis skipped') else: if isinstance(ctl_bam_files, list) and isinstance(tre_bam_files, list): # fetch the scale te_path = prj_path['transposon_analysis'] for i in ext_tre_bam_files: i_index = ext_tre_bam_files.index(i) # genome mapping BAM ext_tre_bam = i[0] tre_bam = tre_bam_files[i_index] if i_index >= len(ext_ctl_bam_files): i_index = 0 ext_ctl_bam = ext_ctl_bam_files[i_index][0] ctl_bam = ctl_bam_files[i_index] # fetch the normalize scale tre_bam_prefix = file_prefix(tre_bam)[0] tre_bam_path = os.path.join(prj_path['macs2_output'], tre_bam_prefix) p = Macs2(ip=tre_bam, control=ctl_bam, genome=args.g, output=tre_bam_path, prefix=tre_bam_prefix) d = p.get_effect_size( ) # ip_scale, ip_depth, input_scale, 
input_depth # bam to bigWig te_sub_path = os.path.join(te_path, tre_bam_prefix) bam2bigwig2(ext_tre_bam, te_sub_path, scale=d['ip_scale'], overwrite=args.overwrite) bam2bigwig2(ext_ctl_bam, te_sub_path, scale=d['input_scale'], overwrite=args.overwrite) # save scale to file s1 = os.path.join(te_sub_path, 'scale.pickle') s2 = os.path.join(te_sub_path, 'scale.lib') args_checker(d, s1) args_logger(d, s2) # create coverage plots pdf_out = os.path.join(te_sub_path, tre_bam_prefix + '.track_view.pdf') fasize = 'abc.fa' bigwig2track_single(ext_tre_bam, ext_ctl_bam, fasize, 'P5', pdf_out)
import cv2
import sys
import numpy as np
import datetime

from alignment import Alignment

sys.path.append('../SSH')
from ssh_detector import SSHDetector

#short_max = 800
scales = [1200, 1600]
t = 2

detector = SSHDetector('../SSH/model/e2ef', 0)
alignment = Alignment('./model/3d_I5', 12)

out_filename = './out.png'
f = '../sample-images/t1.jpg'
if len(sys.argv) > 1:
    f = sys.argv[1]
img = cv2.imread(f)
im_shape = img.shape
print(im_shape)
target_size = scales[0]
max_size = scales[1]
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
if im_size_min > target_size or im_size_max > max_size:
    im_scale = float(target_size) / float(im_size_min)
    # prevent bigger axis from being more than max_size:
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
def read_alignments(self, input_dir, file_format, header_grep=None, compression=None): """ Get list of alignment files from an input directory *.fa, *.fas and *.phy files only Stores in self.files """ optioncheck(compression, [None, 'gz', 'bz2']) if file_format == 'fasta': extensions = ['fa', 'fas', 'fasta'] elif file_format == 'phylip': extensions = ['phy'] else: extensions = [] if compression: extensions = ['.'.join([x, compression]) for x in extensions] files = fileIO.glob_by_extensions(input_dir, extensions) files.sort(key=SORT_KEY) self._input_files = files records = [] pbar = setup_progressbar("Loading files", len(files), simple_progress=True) pbar.start() for i, f in enumerate(files): if compression is not None: with fileIO.TempFile() as tmpfile: with fileIO.freader(f, compression) as reader, fileIO.fwriter( tmpfile) as writer: for line in reader: writer.write(line) try: record = Alignment(tmpfile, file_format, True) except RuntimeError: record = Alignment(tmpfile, file_format, False) else: try: record = Alignment(f, file_format, True) except RuntimeError: record = Alignment(f, file_format, False) if header_grep: try: datatype = 'dna' if record.is_dna() else 'protein' record = Alignment([(header_grep(x), y) for (x, y) in record.get_sequences()], datatype) except TypeError: raise TypeError("Couldn't apply header_grep to header\n" "alignment number={}, name={}\n" "header_grep={}".format( i, fileIO.strip_extensions(f), header_grep)) except RuntimeError: print( 'RuntimeError occurred processing alignment number={}, name={}' .format(i, fileIO.strip_extensions(f))) raise record.name = (fileIO.strip_extensions(f)) records.append(record) pbar.update(i) pbar.finish() return records
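
# Python 3 sketch (assumption: this approximates what fileIO.freader does):
# open a text file transparently whether it is plain, gzip- or
# bzip2-compressed, keyed off the same `compression` argument used above.
import bz2
import gzip

def open_maybe_compressed(path, compression=None):
    """Return a text-mode file handle, decompressing gz/bz2 if requested."""
    if compression == 'gz':
        return gzip.open(path, 'rt')
    if compression == 'bz2':
        return bz2.open(path, 'rt')
    return open(path, 'r')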