Example no. 1
    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError

            compare = lambda x, y: collections.Counter(x) == collections.Counter(y)

            if not compare(old_align.species, self.alignment.species):
                log.error("Species names in alignment have changed since "
                          "previous run. You need to use the force-restart "
                          "option.")
                raise AnalysisError

        else:
            self.alignment.write(self.alignment_path)
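A note on the Counter trick above: collections.Counter turns each species list into a multiset, so the comparison succeeds even when the sequences were reordered between runs. A minimal, self-contained sketch (names are illustrative):

import collections

def same_species(old, new):
    # True when both lists hold the same names with the same multiplicities
    return collections.Counter(old) == collections.Counter(new)

print(same_species(['b', 'a', 'c'], ['a', 'b', 'c']))  # True
print(same_species(['a', 'b'], ['a', 'b', 'b']))       # False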
Example no. 2
    def build_expr(self, context, expr, filter=None, align=None):
        score_expr = LogitScore(expr)
        if align is not None:
            # we do not need add_filter because Alignment already handles it
            return Alignment(score_expr, align, filter=filter)
        else:
            return self.add_filter(ComparisonOp('>', score_expr, 0.5), filter)
    def generate_xml_tree(self):
        """
        Try to parse xml, generate tree with xml tags and then cast it to mainAligment object and Alignment
        :return: exception when file has't got correct content
        """
        try:
            tree = et.parse(self.file)
            self.root = tree.getroot()
            self.blast_output = self.root[8]
            self.iteration = self.blast_output[0]
            self.iteration_hit = self.iteration[4]

            for i in self.iteration_hit:
                self.hits.append(i)

            for i in self.hits:
                h = []
                for j in i:
                    h.append(j)

                for hsp in h[5]:
                    procent = "{0:.2f}".format(
                        int(hsp[10].text) / int(hsp[13].text) * 100)
                    procent = float(procent)
                    self.aligns.append(
                        Alignment(h[2].text, hsp[1].text, procent,
                                  hsp[12].text, hsp[10].text, hsp[13].text,
                                  hsp[14].text, hsp[15].text, hsp[16].text))
                self.main_alignments.append(
                    MainAlignment(i[2].text, self.aligns))
                self.aligns = []
        except IndexError:
            print("Bad file.")
Example no. 4
    def make_alignment(self, cfg, alignment):
        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(alignment, self)
        sub_path = os.path.join(cfg.phylofiles_path, self.name + '.phy')
        # Add it into the sub, so we keep it around
        self.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s", sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error(
                    "It looks like you have changed one or more of the "
                    "data_blocks in the configuration file, "
                    "so the new subset alignments "
                    "don't match the ones stored for this analysis. "
                    "You'll need to run the program with --force-restart")
                raise SubsetError
        else:
            # We need to write it
            sub_alignment.write(sub_path)
Example no. 5
    def alignment(self):
        """Make self into an alignment, and return it.

        If all the sequences are the same length and type, then self,
        a sequenceList, could be an Alignment.  This method generates
        an Alignment instance, runs the Alignment method
        checkLengthsAndTypes(), and returns the Alignment.

        If you feed p4 a fasta sequence, it makes SequenceList object,
        and runs this method on it.  If it works then p4 puts the
        Alignment object in var.alignments, and if not it puts the
        SequenceList object in var.sequenceLists.

        It is possible that p4 might think that some short sequences
        are DNA when they are really protein.  In that case it will
        fail to make an alignment, because it will fail the types
        check.  So what you can do is something like this::

            sl = var.sequenceLists[0]
            for s in sl.sequences:
                s.dataType = 'protein'
            a = sl.alignment()

        """

        from alignment import Alignment
        a = Alignment()
        a.fName = self.fName
        import copy
        a.sequences = copy.deepcopy(self.sequences)  # self will be deleted
        a.checkLengthsAndTypes()
        return a
Example no. 6
    def permuted_copy(self, partition=None):
        """ Return a copy of the collection with all alignment columns permuted
        """
        def take(n, iterable):
            return [next(iterable) for _ in range(n)]

        if partition is None:
            partition = Partition([1] * len(self))

        index_tuples = partition.get_membership()

        alignments = []
        for ix in index_tuples:
            concat = Concatenation(self, ix)
            sites = concat.alignment.get_sites()
            random.shuffle(sites)
            d = dict(
                zip(concat.alignment.get_names(),
                    [iter(x) for x in zip(*sites)]))
            new_seqs = [[(k, ''.join(take(l, d[k]))) for k in d]
                        for l in concat.lengths]

            for seqs, datatype, name in zip(new_seqs, concat.datatypes,
                                            concat.names):
                alignment = Alignment(seqs, datatype)
                alignment.name = name
                alignments.append(alignment)

        return self.__class__(
            records=sorted(alignments, key=lambda x: SORT_KEY(x.name)))
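The column permutation above rests on a double transpose: zip(*rows) turns the sequences into site columns, the columns are shuffled, and a second zip(*...) rebuilds the sequences. The idea in isolation, on toy data:

import random

seqs = {'taxon1': 'ACGT', 'taxon2': 'AGGT', 'taxon3': 'ACGA'}
names = list(seqs)

columns = list(zip(*(seqs[n] for n in names)))  # sequences -> site columns
random.shuffle(columns)                         # permute alignment columns
permuted = {n: ''.join(sites) for n, sites in zip(names, zip(*columns))}
print(permuted)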
Example no. 7
def main():
    args = args_init(vars(get_args()), align=True)  # save as dictionary

    # log.info('aaaaa')

    # args['align_to_te'] = True

    ## run alignment
    map_bam_list = Alignment(**args).run()
Example no. 8
    def execute(self):

        # Alignment
        # TODO: choose mode automatically
        msa = Alignment(messages=self.messages,
                        output_dir=self.output_dir,
                        mode=self.mode,
                        multithread=self.multithread)
        #msa = Alignment(messages=self.messages, output_dir=self.output_dir, multithread=True)
        msa.execute()
        # exit()

        # Generate fields
        filepath_fields_info = os.path.join(self.output_dir,
                                            Alignment.FILENAME_FIELDS_INFO)
        self.fields, fid_list = self.generate_fields_by_fieldsinfo(
            filepath_fields_info)
        logging.debug("Number of keyword candidates: {}\nfid: {}".format(
            len(fid_list), fid_list))

        # Compute probabilities of observation constraints
        constraint = Constraint(messages=self.messages,
                                direction_list=self.direction_list,
                                fields=self.fields,
                                fid_list=fid_list,
                                output_dir=self.output_dir)

        pairs_p, pairs_size = constraint.compute_observation_probabilities()
        pairs_p_request, pairs_p_response = pairs_p
        pairs_size_request, pairs_size_response = pairs_size
        constraint.save_observation_probabilities(pairs_p_request,
                                                  pairs_size_request,
                                                  Constraint.TEST_TYPE_REQUEST)
        constraint.save_observation_probabilities(
            pairs_p_response, pairs_size_response,
            Constraint.TEST_TYPE_RESPONSE)

        # pairs_p_request, pairs_size_request = constraint.load_observation_probabilities(Constraint.TEST_TYPE_REQUEST)
        # pairs_p_response, pairs_size_response = constraint.load_observation_probabilities(Constraint.TEST_TYPE_RESPONSE)
        # print(pairs_p_request, pairs_size_request)
        # print(pairs_p_response, pairs_size_response)

        # Probabilistic inference
        pairs_p_all, pairs_size_all = self.merge_constraint_results(
            pairs_p_request, pairs_p_response, pairs_size_request,
            pairs_size_response)

        ffid_list = ["{0}-{0}".format(fid)
                     for fid in fid_list]  #only test same fid for both sides
        pi = ProbabilisticInference(pairs_p=pairs_p_request,
                                    pairs_size=pairs_size_request)
        fid_inferred = pi.execute(ffid_list)

        ## TODO: iterative
        ## TODO: format inference

        return fid_inferred
Example no. 9
    def get_results(self):
        if self.file_read_job is None:
            return self.results
        else:
            # self.results=read_internal_alignment(self.alignedfn,)
            alignment = Alignment()
            alignment.datatype = self.datatype
            alignment.read_filepath(self.alignedfn, file_format='FASTA')
            self.results = alignment
            return self.results
Example no. 10
    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error(
                    "Alignment file has changed since previous run. You need to use the force-restart option."
                )
                raise AnalysisError

        else:
            self.alignment.write(self.alignment_path)
Example no. 11
    def __init__(self, gui, parent=None):
        """
        Establish the connection with the main gui, set some instance variables and initialize all
        flags to False.

        :param gui: main gui object
        """

        QtCore.QThread.__init__(self, parent)
        self.gui = gui

        # Create the alignment object. Alignment points are kept throughout the whole program
        # execution, even if the telescope driver or other configuration parameters are changed.
        self.al = Alignment(self.gui.configuration, debug=self.gui.configuration.alignment_debug)

        self.exiting = False

        self.output_channel_initialization_flag = False
        self.telescope_initialization_flag = False
        self.camera_initialization_flag = False
        self.new_tesselation_flag = False
        self.slew_to_alignment_point_flag = False
        self.perform_alignment_flag = False
        self.perform_autoalignment_flag = False
        self.slew_to_moon_limb_flag = False
        self.set_focus_area_flag = False
        self.goto_focus_area_flag = False
        self.slew_to_tile_and_record_flag = False
        self.move_to_selected_tile_flag = False
        self.escape_pressed_flag = False

        # Save the descriptor of standard output. Stdout might be redirected to a file and back
        # later.
        self.stdout_saved = sys.stdout

        # Initialize status variables.
        self.output_redirected = False
        self.telescope_connected = False
        self.camera_connected = False
        self.tesselation_created = False

        # Initialize some instance variables.
        self.active_tile_number = -1
        self.all_tiles_recorded = False
        self.protocol_file = None
        self.telescope = None
        self.camera = None
        self.date_time = None
        self.me = None
        self.tc = None
        self.repeat_from_here = None
        self.tile_indices_since_last_autoalign = None

        self.start()
Example no. 12
def te_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Mapping reads to genome
    control or treatment
    args dict, the arguments of pipeline
    check index
    1. rRNA
    2. genome
    3. spike-in-rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    te_align_path = project_path['transposon']

    args['extra_index'] = None  # pre-build

    # ## qc-report
    # qc_path = os.path.join(te_align_path['report'], 'qc')
    # QC_reporter(fq1_files, qc_path).run() ## skip, run in gene_aligner

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = te_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = True

    # extra small genome
    small_genome = args['small_genome']
    args['small_genome'] = True

    ## run alignment
    map_bam_list = Alignment(**args).run()
    map_bam = [item for sublist in map_bam_list for item in sublist]

    # create bigWig files
    # for bam in map_bam:
    #     bam2bigwig(
    #         bam=bam,
    #         genome=args['genome'],
    #         path_out=te_align_path['bigWig'],
    #         strandness=args['s'],
    #         binsize=args['bin_size'],
    #         overwrite=args['overwrite'])

    ## restore args
    args['small_genome'] = small_genome

    return map_bam
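Alignment(**args).run() returns one list of BAM paths per sample, hence the nested-list flattening above. The pattern by itself (paths are illustrative):

map_bam_list = [['s1.rRNA.bam', 's1.genome.bam'], ['s2.genome.bam']]
map_bam = [item for sublist in map_bam_list for item in sublist]
print(map_bam)  # ['s1.rRNA.bam', 's1.genome.bam', 's2.genome.bam']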
Example no. 13
    def __init__(self, line_string):
        self.type = self.TYPE_HEADER if line_string.startswith('@') \
                else self.TYPE_ALIGNMENT

        if self.type == self.TYPE_HEADER:
            self.fields = [line_string]
            return

        self.fields = line_string.split()
        pos, cigar = self.fields[3], self.fields[5]

        if cigar == '*':
            raise CigarUnavailableError

        md = next(filter(lambda field: field.startswith('MD:Z:'), self.fields))
        md = md.replace('MD:Z:', '')
        self.alignment = Alignment(pos, cigar, md)
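One caveat in the constructor above: next(filter(...)) raises StopIteration when a record carries no MD tag. A sketch of the same lookup with an explicit default (the SAM line is illustrative):

fields = 'read1 0 chr1 100 60 4M * 0 0 ACGT IIII MD:Z:4'.split()
md = next((f for f in fields if f.startswith('MD:Z:')), None)
if md is not None:
    md = md[len('MD:Z:'):]
print(md)  # '4'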
Example no. 14
def gene_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Mapping reads to genome
    control or treatment
    args dict, the arguments of pipeline
    check index
    1. rRNA
    2. genome
    3. spike-in-rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    gene_align_path = project_path['gene']

    ## qc-report
    qc_path = os.path.join(gene_align_path['report'], 'qc')
    # QC_reporter(fq1_files, qc_path).run()

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = gene_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = False

    ## run alignment
    map_bam_list = Alignment(**args).run()

    ## filt map_genome
    map_bam = []
    for i in map_bam_list:
        for k in i:
            if k.endswith('map_' + args['genome'] + '.bam'):
                map_bam.append(k)

    # # create bigWig files
    # for bam in map_bam:
    #     bam2bigwig(
    #         bam=bam,
    #         genome=args['genome'],
    #         path_out=gene_align_path['bigWig'],
    #         strandness=args['s'],
    #         binsize=args['bin_size'],
    #         overwrite=args['overwrite'])

    return map_bam
Example no. 15
def read_internal_alignment(fn,
                            file_format='FASTA',
                            datatype=None,
                            dirs_to_delete=(),
                            temp_fs=None):
    alignment = Alignment()
    alignment.datatype = datatype
    alignment.read_filepath(fn, file_format=file_format)
    if len(alignment) >= 1:
        if dirs_to_delete:
            assert temp_fs
            for d in dirs_to_delete:
                time.sleep(.1)  #TODO: not sure why this is here!
                temp_fs.remove_dir(d)
        return alignment
    else:
        raise ValueError(
            "The alignment file %s has no sequences. PASTA quits." % fn)
Example no. 16
def solveAlignment(method, fileName):
    alignment = Alignment(fileName)  # Create the alignment from the file
    alignment.readFile()  # Read the file with the input data
    if method == '1':  # Brute force was chosen
        # start = datetime.now()
        result, result1, result2 = alignment.bruteForceSolving()  # Solve
        alignment.printBruteForce(result, result1,
                                  result2)  # Print the results
        # print(datetime.now() - start)
    elif method == '2':
        start = datetime.now()
        matrix, moves, result, result1, result2 = alignment.dynamicSolving(
        )  # Solve
        alignment.printDynamic(matrix, moves, result, result1,
                               result2)  # Print the results
        # print(datetime.now() - start)
    else:
        error(
            "Error: check that you are using the correct parameters.\n"
            "Use [-h] for help."
        )
Example no. 17
    def simulate(self, partition, outdir, batchsize=1, **kwargs):
        """
        Simulate a set of alignments from the parameters inferred on a partition
        :param partition:
        :return:
        """
        indices = partition.get_membership()
        # NOTE: assumed that 'background' arrives via **kwargs; it was
        # referenced below but never defined in this scope.
        background = kwargs.pop('background', False)
        self.add_lnl_partitions(partition, **kwargs)
        results = [self.lnl_cache[ix] for ix in indices]
        places = dict((j, i) for (i, j) in enumerate(
            rec.name for rec in self.collection.records))

        # Collect argument list
        args = [None] * len(self.collection)
        for result in results:
            for partition in result['partitions'].values():
                place = places[partition['name']]
                args[place] = (len(self.collection[place]),
                               model_translate(partition['model']),
                               partition['frequencies'], partition['alpha'],
                               result['ml_tree'], partition['rates']
                               if 'rates' in partition else None)

        # Distribute work
        msg = 'Simulating'
        client = get_client()
        if client is None:
            map_result = sequential_map(client, tasks.simulate_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.simulate_task, args, msg,
                                      batchsize, background)
            if background:
                return map_result

        # Process results
        for i, result in enumerate(map_result):
            orig = self.collection[i]
            simseqs = gapmask(result, orig.get_sequences())
            al = Alignment(simseqs, 'protein' if orig.is_protein() else 'dna')
            outfile = os.path.join(outdir, orig.name + '.phy')
            al.write_alignment(outfile, 'phylip', True)
Example no. 18
    def make_alignment(self, cfg, alignment):
        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(alignment, self)

        sub_path = os.path.join(cfg.phylofiles_path, self.subset_id + '.phy')
        # Add it into the sub, so we keep it around
        self.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s", sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error(self.FORCE_RESTART_MESSAGE)
                raise SubsetError
        else:
            # We need to write it
            sub_alignment.write(sub_path)
Example no. 19
def extra_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Mapping reads to genome
    control or treatment
    args dict, the arguments of pipeline
    check index
    1. rRNA
    2. genome
    3. spike-in-rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    extra_align_path = project_path['extra']

    ## qc-report
    qc_path = os.path.join(extra_align_path['report'], 'qc')
    # QC_reporter(fq1_files, qc_path).run()

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = extra_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = False

    # extra small genome, for STAR
    small_genome = args['small_genome']
    args['small_genome'] = True

    ## run alignment
    map_bam = Alignment(**args).run()

    ## restore args
    args['small_genome'] = small_genome

    ## return
    return map_bam
Example no. 20
def pre_process(optmap_i, optmap_file, myfile, myfile2, output_dir,
                min_confidence):
    header_lines = 10
    header = []
    minrefoverhang = 50000
    minqryoverhang = 50000

    # all_alms stores all the Alignments for every group; all_alms[ref] holds molecule ref
    all_alms = {}
    # keep only one alignment (the one with the highest confidence) per contig in a molecule
    qualify_alms = {}
    # removed[ref, qry] == True means the alignment for (ref, qry) has been removed
    removed = {}

    # collecting alignments and store in all_groups
    print '---------------read .xmap file-------------------'
    with open(myfile + '_flip.xmap', 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='\t')
        for i in range(header_lines):  # 10 lines of header
            header.append(csvreader.next())  # save them
        # read the first non-header line
        while True:
            try:
                row = csvreader.next()
                x = Alignment(int(row[1]), int(row[2]), float(row[3]),
                              float(row[4]), float(row[5]), float(row[6]),
                              row[7], float(row[8]), row[9], float(row[10]),
                              float(row[11]), int(row[12]), row[13])
                if x.ref not in all_alms:
                    all_alms[x.ref] = [x]
                else:
                    all_alms[x.ref].append(x)
            except StopIteration:
                break
    num_all_alms = 0
    for ref in all_alms:
        num_all_alms += len(all_alms[ref])
    print "In total, the number of alignments collected is ", num_all_alms

    # only keep one alignment(the one with highest confidence) for each contig in one molecule
    for ref in all_alms:
        group = all_alms[ref]
        qry_bestx = {}
        for x in group:
            if x.qry not in qry_bestx:
                qry_bestx[x.qry] = x
            else:
                if x.confidence > qry_bestx[x.qry].confidence:
                    qry_bestx[x.qry] = x

        qualify_alms[ref] = {}
        for qry in qry_bestx:
            qualify_alms[ref][qry] = qry_bestx[qry]

    num_qualify_alms = 0
    for ref in qualify_alms:
        num_qualify_alms += len(qualify_alms[ref])
    # initialize removed array
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            removed[ref, qry] = False
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/opt_" + str(optmap_i) + "_alms_0_initial.log")
    print "In total, the number of alignments in qualify_alms is ", num_qualify_alms

    # remove low confidence alignments
    print '---------------Remove low quality alignments---------------'
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if x.confidence < min_confidence:
                removed[ref, qry] = True
                print 'alignment (', ref, ',', qry, ') is low quality and removed'
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if not removed[ref, qry]:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms,
        output_dir + "/opt_" + str(optmap_i) + "_alms_1_removed_low_conf.log")
    print "After removing low confidence alignments, the number of alignments is ", num_alms
    print '---------------End---------------'

    # read optical map
    optmap = {}
    with open(optmap_file) as f_map:
        for line in f_map:
            line = line.strip()
            if line[0] == '#':
                continue
            cols = line.split('\t')
            CMapId = int(cols[0])
            LabelChannel = cols[4]
            Position = float(cols[5])

            if CMapId not in optmap:
                optmap[CMapId] = []
            if LabelChannel == "1":
                optmap[CMapId].append(Position)
    for CMapId in optmap:
        optmap[CMapId].sort()

    print '---------------scaling-------------------'
    # calculating scaling
    qry_len = {}
    with open(myfile2 + '_key.txt') as f_key:
        for i in range(0, 4):  # 4 header lines
            f_key.readline()
        for line in f_key:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            seq_len = int(cols[2])
            qry_len[qry_id] = seq_len
    scaling = 0
    num = 0
    with open(myfile + '_r.cmap') as f_q:
        for i in range(0, 11):  # 11 header lines
            f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            seq_len = qry_len[qry_id]
            scaling += appr_len / seq_len
            num += 1
    scaling /= num  # scaling=1.02258059775
    # NOTE: the computed value is discarded on the next line; scaling is disabled
    scaling = 1.0
    # use scaling to adjust coordinates of alignments
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            x.qrystartpos /= scaling
            x.qryendpos /= scaling
            x.qrylen /= scaling
            x.refstartpos /= scaling
            x.refendpos /= scaling
            x.reflen /= scaling

    # use scaling to adjust coordinates of the optical map
    for ref in optmap:
        for i in range(0, len(optmap[ref])):
            optmap[ref][i] /= scaling

    print '---------------END-------------------'

    # find the reference-based coordinates for each contig
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if (x.orientation == '+'):
                x.qry_left_overlen = x.qrystartpos
                x.qry_right_overlen = x.qrylen - x.qryendpos
            else:
                x.qry_left_overlen = x.qrylen - x.qrystartpos
                x.qry_right_overlen = x.qryendpos
            x.start = x.refstartpos - x.qry_left_overlen
            x.end = x.refendpos + x.qry_right_overlen
            x.ref_left_overlen = x.refstartpos
            x.ref_right_overlen = x.reflen - x.refendpos
            if (x.orientation == '+'):
                x.refstart = x.qrystartpos - x.ref_left_overlen
                x.refend = x.qryendpos + x.ref_right_overlen
            else:
                x.refstart = x.qryendpos - x.ref_right_overlen
                x.refend = x.qrystartpos + x.ref_left_overlen

    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if not removed[ref, qry]:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/opt_" + str(optmap_i) + "_alms_2_scaled.log")
    print "After scaling, the number of alignments is ", num_alms

    # read qry map
    qry_markers = {}
    with open(myfile + '_r.cmap') as f_q:
        for i in range(11):  # 11 header lines
            header_line = f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            CMapId = int(cols[0])
            ContigLength = float(cols[1])
            NumSites = int(cols[2])
            SiteID = int(cols[3])
            LabelChannel = cols[4]
            Position = float(cols[5])
            if LabelChannel == "0":
                continue
            if CMapId not in qry_markers:
                qry_markers[CMapId] = []
            Position /= scaling
            qry_markers[CMapId].append(Position)
    for CMapId in qry_markers:
        qry_markers[CMapId].sort()

    print '---------------candidate cutting sites-------------------'
    fpair = file(output_dir + "/chimeric_pairs_" + str(optmap_i) + ".log", 'w')
    fpair.write("ref_id\tref_pos\tqry_id\tqry_pos\n")
    chimeric_pairs = []

    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry]:
                continue
            x = qualify_alms[ref][qry]

            if (x.confidence > min_confidence):
                ref_left_overlen = x.refstartpos
                ref_right_overlen = x.reflen - x.refendpos
                flag_left = False
                flag_right = False
                if (x.qry_left_overlen > minqryoverhang
                        and ref_left_overlen > minrefoverhang
                        and markers_in_qry_left_overhang(qry_markers, x) > 0):
                    flag_left = True
                    chimeric_pairs.append(
                        (x.ref, x.refstartpos, x.qry, x.qrystartpos))
                    print (x.ref, x.refstartpos, x.qry, x.qrystartpos), "is a pair of candidate cutting sites"
                    fpair.write(
                        str(x.ref) + "\t" + str(x.refstartpos) + "\t" +
                        str(x.qry) + "\t" + str(x.qrystartpos) + "\n")
                if (x.qry_right_overlen > minqryoverhang
                        and ref_right_overlen > minrefoverhang
                        and markers_in_qry_right_overhang(qry_markers, x) > 0):
                    flag_right = True
                    chimeric_pairs.append(
                        (x.ref, x.refendpos, x.qry, x.qryendpos))
                    print (x.ref, x.refendpos, x.qry, x.qryendpos), "is a pair of candidate cutting sites"
                    fpair.write(
                        str(x.ref) + "\t" + str(x.refendpos) + "\t" +
                        str(x.qry) + "\t" + str(x.qryendpos) + "\n")
                if flag_left and flag_right:
                    removed[ref, qry] = True
    fpair.close()
    print '---------------END-------------------'
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if not removed[ref, qry]:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms, output_dir + "/opt_" + str(optmap_i) +
        "_alms_3_removed_both_overhang.log")
    print "After removing alignments with both overhangs, the number of alignments is ", num_alms

    # check overlap between alignments
    for r in qualify_alms:
        for q1 in qualify_alms[r]:
            if removed[r, q1]:
                continue
            x = qualify_alms[r][q1]
            for q2 in qualify_alms[r]:
                if removed[r, q2]:
                    continue
                y = qualify_alms[r][q2]
                if q1 >= q2:
                    continue
                if x.refstartpos <= y.refstartpos <= x.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - y.refstartpos
                elif y.refstartpos <= x.refstartpos <= y.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - x.refstartpos
                else:
                    overlap = 0
                if overlap >= 20000:
                    if x.confidence < y.confidence:
                        removed[r, q1] = True
                    else:
                        removed[r, q2] = True
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if not removed[ref, qry]:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms,
        output_dir + "/opt_" + str(optmap_i) + "_alms_4_solved_overlaps.log")
    print "After removing one of two overlap alignments, the number of alignments is ", num_alms

    return current_alms, optmap, chimeric_pairs
Example no. 21
def mtp(myfile, myfile2, output_dir, GLPSOL, false_alm_threshold,
        min_confidence):

    # discard alignments below min_confidence
    #min_confidence = 25
    header_lines = 10
    header = []
    # alignment overhangs above this number of bps are considered chimeric
    #minrefoverhang = 100000
    #minqryoverhang = 100000

    # all_alms stores all the Alignments for every group; all_alms[ref] holds molecule ref
    all_alms = {}
    # keep only one alignment (the one with the highest confidence) per contig in a molecule
    qualify_alms = {}
    # removed[ref, qry] == True means the alignment for (ref, qry) has been removed
    removed = {}

    # collecting alignments and store in all_groups
    print '---------------read .xmap file-------------------'
    with open(myfile + '.xmap', 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='\t')
        for i in range(header_lines):  # 10 lines of header
            header.append(csvreader.next())  # save them
        # read the first non-header line
        while True:
            try:
                row = csvreader.next()
                x = Alignment(int(row[1]), int(row[2]), float(row[3]),
                              float(row[4]), float(row[5]), float(row[6]),
                              row[7], float(row[8]), row[9], float(row[10]),
                              float(row[11]), int(row[12]), row[13])
                if x.ref not in all_alms:
                    all_alms[x.ref] = [x]
                else:
                    all_alms[x.ref].append(x)
            except StopIteration:
                break
    num_all_alms = 0
    for ref in all_alms:
        #print 'collected', len(all_alms[ref]), 'alignments for molecule', ref
        num_all_alms += len(all_alms[ref])
    print "In total, the number of alignments collected is ", num_all_alms

    # only keep one alignment(the one with highest confidence) for each contig in one molecule
    for ref in all_alms:
        group = all_alms[ref]
        qry_bestx = {}
        for x in group:
            if x.qry not in qry_bestx:
                qry_bestx[x.qry] = x
            else:
                if x.confidence > qry_bestx[x.qry].confidence:
                    qry_bestx[x.qry] = x

        qualify_alms[ref] = {}
        for qry in qry_bestx:
            qualify_alms[ref][qry] = qry_bestx[qry]

    num_qualify_alms = 0
    for ref in qualify_alms:
        num_qualify_alms += len(qualify_alms[ref])
    print "In total, the number of alignments in qualify_alms is ", num_qualify_alms

    # initialize removed array
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            removed[ref, qry] = False

    # find the reference-based coordinates for each alignment
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if (x.orientation == '+'):
                x.qry_left_overlen = x.qrystartpos
                x.qry_right_overlen = x.qrylen - x.qryendpos
            else:
                x.qry_left_overlen = x.qrylen - x.qrystartpos
                x.qry_right_overlen = x.qryendpos
            x.start = x.refstartpos - x.qry_left_overlen
            x.end = x.refendpos + x.qry_right_overlen

    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms, output_dir + "/alms_0_initial.log")
    print "Initially, the number of alignments is", count_alms(current_alms)
    alms_0 = copy_alms(qualify_alms, removed)
    aligned_contigs = different_contigs(alms_0, {})
    output_contigs(aligned_contigs, myfile + '_aligned.txt')
    print '---------------END-------------------'

    # remove low confidence alignments
    print '---------------Remove low quality alignments---------------'
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if x.confidence < min_confidence:
                removed[ref, qry] = True
                print 'alignment (', ref, ',', qry, ') is low quality and removed'
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if not removed[ref, qry]:
                num_alms += 1

    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms, output_dir + "/alms_1_removed_lowconf.log")
    print "After removing low confidence alignments, the number of alignments is", count_alms(
        current_alms)
    alms_1 = copy_alms(qualify_alms, removed)
    lowconf_contigs = different_contigs(alms_0, alms_1)
    output_contigs(lowconf_contigs, myfile + '_lowconf.txt')
    print '---------------End---------------'

    print '---------------removing false positive alignments-------------------'
    current_alms = copy_alms(qualify_alms, removed)
    false_alms(GLPSOL, false_alm_threshold, current_alms, removed, output_dir)
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms, output_dir + "/alms_2_removed_false_alms.log")
    print "After removing false positive alignments, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    print '---------------removing contained contigs locally-------------------'
    for ref in qualify_alms:
        for q1 in qualify_alms[ref]:
            x = qualify_alms[ref][q1]
            for q2 in qualify_alms[ref]:
                if q2 <= q1:
                    continue
                y = qualify_alms[ref][q2]
                if (x.start >= y.start) and (x.end <= y.end):
                    removed[ref, q1] = True
                    print [ref, q1], "alignment is removed because it's contained in alignment", [ref, q2]
                elif (y.start >= x.start) and (y.end <= x.end):
                    removed[ref, q2] = True
                    print [ref, q2], "alignment is removed because it's contained in alignment", [ref, q1]
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/alms_3_removed_contained_locally.log")
    print "After removing contained alignments locally, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    #build the mst
    print '---------------building the mst-------------------'
    fo = file(output_dir + "/ugraph_1.log", 'w')
    current_alms = copy_alms(qualify_alms, removed)
    forest, vertex_orientations = get_mst(current_alms, fo)
    fo.close()
    output_forest(forest, vertex_orientations, output_dir + "/forest_1.log")
    print '---------------END-------------------'
    # unify the coordinates
    print '---------------unifying the coordinates-------------------'
    current_alms = copy_alms(qualify_alms, removed)
    unify_alms = unify_coords(output_dir, current_alms, forest,
                              vertex_orientations)

    removed_unify = {}
    for root in unify_alms:
        for qry in unify_alms[root]:
            removed_unify[root, qry] = False

    contigs = set([])
    for root in unify_alms:
        for qry in unify_alms[root]:
            if qry in contigs:
                print qry, "appears in more than 1 trees"
            contigs.add(qry)
    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms, output_dir + "/alms_4_unified.log")
    print "After unifying the coordinates, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    print '---------------removing contained contigs globally-------------------'
    contained = set([])
    for root in unify_alms:
        for q1 in unify_alms[root]:
            x = unify_alms[root][q1]
            for q2 in unify_alms[root]:
                if q2 <= q1:
                    continue
                y = unify_alms[root][q2]
                if (q2 not in contained) and (x.start >= y.start) and (x.end <= y.end):
                    contained.add(q1)
                    removed_unify[root, q1] = True
                    print [root, q1], "alignment is removed because it's contained in alignment", [root, q2]
                elif (q1 not in contained) and (y.start >= x.start) and (y.end <= x.end):
                    contained.add(q2)
                    removed_unify[root, q2] = True
                    print [root, q2], "alignment is removed because it's contained in alignment", [root, q1]
    for root in unify_alms:
        for qry in unify_alms[root]:
            if qry in contained and not removed_unify[root, qry]:
                removed_unify[root, qry] = True
                print [root, qry], "alignment is removed because qry is a contained contig"

    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms,
                output_dir + "/alms_5_removed_contained_globally.log")
    print "After removing contained contigs globally, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    #build new mst
    print '---------------building new mst-------------------'
    fo = file(output_dir + "/ugraph_2.log", 'w')
    current_alms = copy_alms(unify_alms, removed_unify)
    forest_unify, vertex_orientations_unify = get_mst(current_alms, fo)
    fo.close()
    output_forest(forest_unify, vertex_orientations_unify,
                  output_dir + "/forest_2.log")
    print '---------------END-------------------'

    print '---------------merging DAGs-------------------'
    current_alms = copy_alms(unify_alms, removed_unify)
    DAGs = merge_DAGs(current_alms, forest_unify, vertex_orientations_unify)
    output_DAGs(DAGs, output_dir + "/dags.log")
    print '---------------END-------------------'

    #DAG to mtp contig set
    print '---------------mtp-------------------'
    mtp_node_set = get_subDAGs(DAGs, output_dir)
    current_alms = copy_alms(unify_alms, removed_unify)
    mtp = []
    for ref in current_alms:
        for qry in current_alms[ref]:
            x = current_alms[ref][qry]
            if qry in mtp_node_set:
                mtp.append(x)
            else:
                removed_unify[ref, qry] = True
    mtp.sort(key=lambda x: (x.ref, x.start))
    print "In total, the number of alignments in mtp is", len(mtp)

    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms, output_dir + "/alms_6_mtp.log")
    print '---------------END-------------------'

    print '---------------scaling-------------------'
    # calculating scaling
    qry_len = {}
    with open(myfile2 + '_key.txt') as f_key:
        for i in range(0, 4):  # 4 header lines
            f_key.readline()
        for line in f_key:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            seq_len = int(cols[2])
            qry_len[qry_id] = seq_len
    scaling = 0
    num = 0
    with open(myfile + '_q.cmap') as f_q:
        for i in range(0, 11):  # 11 header lines
            f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            seq_len = qry_len[qry_id]
            scaling += appr_len / seq_len
            num += 1
    scaling /= num  # scaling=1.02258059775

    print '---------------outputing-------------------'
    # save the MTP in a new xmap file and count the number of unitigs in each assembly
    with open(myfile + '_list.txt', 'wb') as listfile:
        with open(myfile + '_mtp.xmap', 'wb') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t')
            # copies the old xmap header
            for x in header:
                csvwriter.writerow(x)
            i = 1  # progressive number
            # for steve ->
            #	    scaling = 1.02257561752017878915 # scaling fact from opt map to BP
            previous = 0  # previous qry contig, to remove dups
            for x in mtp:
                # save the contig in listfile only if it is a new one
                if (x.qry != previous):
                    #listfile.write(str(x.qry)+'\n')
                    previous = x.qry

                # for steve ->
                listfile.write(
                    str(x.ref) + '\t' + str(x.qry) + '\t' +
                    str(int(round(float(x.start) / scaling))) + '\t' +
                    str(int(round(float(x.end) / scaling))) + '\t' +
                    x.orientation + '\n')
                # dump the alignment
                csvwriter.writerow([i] + x.unpack())
                i += 1
Example no. 22
    def load_args(self):
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter)
        parser.add_argument("-a",
                            "--alignment",
                            required=True,
                            help="\nEnter alignment file.\n\n")

        parser.add_argument("-s",
                            "--symbol",
                            required=True,
                            choices=["protein", "nucleotide"],
                            help="\nEnter type of alignment being inputted."
                            "\n\n")

        parser.add_argument("-m",
                            "--matrix",
                            required=True,
                            help="\nSelect similarity matrix.\n\n"
                            "| PRESETS: blosum62, blosum90, blosum100,\n"
                            "pam100, pam250, binary.\n\n"
                            "| NOTE: If a preset is not chosen the program\n"
                            "will assume you are loading a file. Make sure\n"
                            "the file you load follows the standard format\n"
                            "set by PAM and BLOSUM.\n\n"
                            "| NOTE: Binary matrices ignore any\n"
                            "similarities among disparate symbols.\n\n"
                            "| NOTE: Nucelotide alignments that are not\n"
                            "converted must use a binary matrix."
                            "\n\n")

        parser.add_argument("-d",
                            "--distribution",
                            required=True,
                            help="\nSelect sequences to define the\n"
                            "background distribution.\n\n"
                            "| PRESETS: self, swiss-prot\n\n"
                            "| NOTE: If a preset is not chosen the program\n"
                            "will assume you are loading a file. Make sure\n"
                            "any file you load follows fasta formatting\n"
                            "and is of the same symbol type (protein or \n"
                            "nucleotide).\n\n"
                            "| NOTE: If self is chosen the sequences\n"
                            "of the inputted alignment file will be used\n"
                            "for the calculation.\n\n")

        parser.add_argument("-c",
                            "--convert",
                            required=True,
                            choices=["yes", "no"],
                            help="\nConvert nucleotide alignment to protein\n"
                            "alignment?\n\n"
                            "| NOTE: Sequences in alignment must be of equal\n"
                            "length to convert.\n\n"
                            "|NOTE: First open reading frame is used.")
        args = parser.parse_args()

        matrix_input = args.matrix
        if args.symbol == "nucleotide" and args.convert == "no":
            if args.distribution == "swiss-prot":
                exit("Nucleotide alignment cannot use amino acid background "
                     "distribution")
            if matrix_input != "binary":
                exit("Nucleotide alignments that are not converted must use a "
                     "binary matrix")
            matrix_input = "binary-nucleotide"
        else:
            if args.symbol == "protein" and args.convert == "yes":
                exit(
                    "Cannot convert protein alignment to nucleotide alignment")
            if matrix_input == "binary":
                matrix_input = "binary-protein"
        sim_matrix = Matrix(matrix_input)

        return Alignment(args.alignment, args.symbol, args.distribution,
                         sim_matrix, args.convert)
Example no. 23
def calculate_chunk(h, ref, beam_size):
    a = Alignment(h, ref)
    line1UsedWords = [False for _ in range(len(a.line1))]
    line2UsedWords = [False for _ in range(len(a.line2))]
    initialPath = PartialAlignment([None for _ in range(len(a.line2))],
                                   line1UsedWords, line2UsedWords)

    # init all possible matches
    for i in range(len(a.line1)):
        for j in range(len(a.line2)):
            if is_similar_word(a.line1[i], a.line2[j]):
                a.matches[j].append(
                    Match(start=j,
                          length=1,
                          matchStart=i,
                          matchLength=1,
                          prob=1))
                a.line1Coverage[i] += 1
                a.line2Coverage[j] += 1

    # One-to-one, non-overlapping matches are definite
    for i in range(len(a.matches)):
        if len(a.matches[i]) == 1:
            m = a.matches[i][0]
            overlap = False
            if (a.line2Coverage[i] != 1):
                overlap = True
            if (a.line1Coverage[m.matchStart] != 1):
                overlap = True
            if not overlap:
                initialPath.matches[i] = m
                initialPath.line2UsedWords[i] = True
                initialPath.line1UsedWords[m.matchStart] = True

    # Resolve best alignment using remaining matches
    paths = []
    nextPaths = [initialPath]
    for current in range(len(a.matches) + 1):
        paths = nextPaths
        nextPaths = []
        paths.sort(key=functools.cmp_to_key(compare))
        # print(paths)
        # Try as many paths as beam allows
        numRank = min(beam_size, len(paths))
        for rank in range(numRank):
            path = paths[rank]
            # Case: path is complete
            if current == len(a.matches):
                # Close last chunk
                if path.lastMatchEnd != -1:
                    path.chunks += 1
                nextPaths.append(path)
                continue
            # Case: Current index word is in use
            if path.line2UsedWords[current]:
                # If fixed match
                if path.matches[path.idx] is not None:
                    m = path.matches[path.idx]
                    path.matchCount += 1
                    path.matches1 += 1
                    path.matches2 += 1
                    # Not continuous in line1
                    if path.lastMatchEnd != -1 and m.matchStart != path.lastMatchEnd:
                        path.chunks += 1
                    # Advance to end of match + 1
                    path.idx = m.start + 1
                    path.lastMatchEnd = m.matchStart + 1
                    path.distance += abs(m.start - m.matchStart)
                    nextPaths.append(path)
                continue
            # Case: Multiple possible matches, for each match starting at index start
            matches = a.matches[current]
            for i in range(len(matches)):
                m = matches[i]
                # Check to see if words are unused
                if path.isUsed(m):
                    continue
                newPath = copy.deepcopy(path)
                # Select m for this start index
                newPath.setUsed(m, True)
                newPath.matches[current] = m
                # Calculate new stats
                newPath.matchCount += 1
                newPath.matches1 += 1
                newPath.matches2 += 1
                # Not continuous in line1
                if newPath.lastMatchEnd != -1 and m.matchStart != newPath.lastMatchEnd:
                    newPath.chunks += 1
                # Advance to end of match + 1
                newPath.idx = m.start + 1
                newPath.lastMatchEnd = m.matchStart + 1
                newPath.distance += abs(m.start - m.matchStart)
                nextPaths.append(newPath)
            # Try skipping this index
            if path.lastMatchEnd != -1:
                path.chunks += 1
                path.lastMatchEnd = -1
            path.idx += 1
            nextPaths.append(path)
        if len(nextPaths) == 0:
            print(
                "Warning: unexpected conditions - skipping matches until possible to continue"
            )
            nextPaths.append(paths[0])
    # Return top best path's chunk number
    nextPaths.sort(key=functools.cmp_to_key(compare))
    return nextPaths[0].chunks, nextPaths[0].matchCount
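The loop above is a beam search: at each step the partial paths are sorted by the comparator and only the best beam_size of them are expanded further. The pruning step in isolation (scores are illustrative):

beam_size = 3
paths = [('p0', 5), ('p1', 9), ('p2', 1), ('p3', 7), ('p4', 3)]
paths.sort(key=lambda p: p[1], reverse=True)  # best-scoring paths first
kept = paths[:beam_size]                      # prune to the beam width
print(kept)  # [('p1', 9), ('p3', 7), ('p0', 5)]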
Example no. 24
import os

from google.appengine.ext import ndb

import jinja2
import webapp2

JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)
# [END imports]

DEFAULT_GUESTBOOK_NAME = 'default_guestbook'



aclass = Alignment()

# [START main_page]
class MainPage(webapp2.RequestHandler):

    def get(self):
        sequence_1 = self.request.get('sequence_1')
        sequence_2 = self.request.get('sequence_2')
        match = self.request.get('match')
        mismatch = self.request.get('mismatch')
        gap = self.request.get('gap')
        alignment = self.request.get('alignment')
        
        print sequence_1, sequence_2, match, mismatch, gap, alignment

        if 'global' in alignment.lower():
Example no. 25
def traceback_process(
    scoring_matrix: List[List[Cell]], seq1: str, seq2: str, starting_cell: Cell
) -> Alignment:
    """This method computes the traceback process for retrieving an alignment
    Args:
        scoring_matrix (List[List[Cell]]): the scoring matrix
        seq1 (str): the first sequence
        seq2 (str): the second sequence
        starting_cell (Cell): the starting cell for the traceback process
    Returns:
        The alignment starting from starting_cell
    """
    subseq1, subseq2 = "", ""
    max_gap_length = 0
    min_gap_length = max(len(seq1), len(seq2))
    n_gaps = 0
    tmp_gap = None
    gap_direction = None
    actual_cell = starting_cell

    while actual_cell.score > 0:
        i, j = actual_cell.indices
        seq_i = seq1[i - 1]
        seq_j = seq2[j - 1]

        if actual_cell.origin == Move.DIAGONAL:
            if tmp_gap is not None:
                # If there was a gap end it and update counts
                max_gap_length = max(max_gap_length, tmp_gap)
                min_gap_length = min(min_gap_length, tmp_gap)
                n_gaps += 1
                tmp_gap = None
                gap_direction = None

            subseq1 += seq_i
            subseq2 += seq_j
            actual_cell = scoring_matrix[i - 1][j - 1]
        elif actual_cell.origin == Move.HORIZONTAL:
            if gap_direction == Move.HORIZONTAL:
                tmp_gap += 1
            else:
                if tmp_gap is not None:
                    max_gap_length = max(max_gap_length, tmp_gap)
                    min_gap_length = min(min_gap_length, tmp_gap)
                    n_gaps += 1
                tmp_gap = 1
                gap_direction = Move.HORIZONTAL

            subseq1 += "-"
            subseq2 += seq_j
            actual_cell = scoring_matrix[i][j - 1]
        elif actual_cell.origin == Move.VERTICAL:
            if gap_direction == Move.VERTICAL:
                tmp_gap += 1
            else:
                if tmp_gap is not None:
                    max_gap_length = max(max_gap_length, tmp_gap)
                    min_gap_length = min(min_gap_length, tmp_gap)
                    n_gaps += 1
                tmp_gap = 1
                gap_direction = Move.VERTICAL

            subseq1 += seq_i
            subseq2 += "-"
            actual_cell = scoring_matrix[i - 1][j]
        else:
            raise Exception(
                f'Something went wrong: origin must be one of '
                f'{", ".join(move.name for move in Move)}'
            )

    if tmp_gap is not None:
        max_gap_length = max(max_gap_length, tmp_gap)
        min_gap_length = min(min_gap_length, tmp_gap)
        n_gaps += 1

    return Alignment(
        subseq1[::-1],
        subseq2[::-1],
        max_gap_length,
        min_gap_length,
        n_gaps,
        starting_cell.score,
        starting_cell.indices,
    )
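traceback_process depends on Cell, Move and Alignment types defined elsewhere in that project; a minimal sketch of definitions compatible with the usage above (the originals are not shown, so treat this as an assumption):

from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional, Tuple

class Move(Enum):
    DIAGONAL = auto()
    HORIZONTAL = auto()
    VERTICAL = auto()

@dataclass
class Cell:
    score: int
    indices: Tuple[int, int]       # (row, column) in the scoring matrix
    origin: Optional[Move] = None  # move that produced this cell's score

@dataclass
class Alignment:
    subseq1: str
    subseq2: str
    max_gap_length: int
    min_gap_length: int
    n_gaps: int
    score: int
    indices: Tuple[int, int]       # where the alignment ends in the matrix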
Example n. 26
0
import cv2
import sys
import numpy as np
import datetime
sys.path.append('../SSH')
sys.path.append('../alignment')
from ssh_detector import SSHDetector
from alignment import Alignment
from embedding import Embedding

#short_max = 800
scales = [1200, 1600]
t = 2

detector = SSHDetector('../SSH/model/e2ef', 0)
alignment = Alignment('../alignment/model/3d_I5', 12)
embedding = Embedding('./model/model', 0)
out_filename = './out.png'

f = '../sample-images/t1.jpg'
if len(sys.argv) > 1:
    f = sys.argv[1]
img = cv2.imread(f)
im_shape = img.shape
print(im_shape)
target_size = scales[0]
max_size = scales[1]
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
if im_size_min > target_size or im_size_max > max_size:
    im_scale = float(target_size) / float(im_size_min)
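The same preprocessing appears in full in Example n. 29 below, where the computed scale is additionally clamped so that the longer image axis never exceeds max_size:

    # as in the fuller variant (Example n. 29):
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)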
Example n. 27
0
import sys
from alignment import Alignment

file = sys.argv[1]
#file = 'test_files/test.txt'
args = open(file).readlines()
flag = args[0].rstrip()
scores = args[1].split()
match = int(scores[0])
mismatch = int(scores[1])
indel = int(scores[2])
seq1 = args[2].rstrip()
seq2 = args[3].rstrip()

if flag == 'g':
    a = Alignment(match,mismatch,indel,seq1,seq2)
    a.single_global_single_align()
    a.report_optimal_score()
elif flag == 'l':
    a = Alignment(match,mismatch,indel,seq1,seq2)
    a.local_single_align()
    a.report_optimal_score()
else:
    print("Invalid alignment flag.")
    sys.exit(1)

with open('results.txt', 'w') as out:
    out.write("Score:")
    out.write(str(a.get_optimal_score()))  # str(): write() needs text
    out.write("\n")
    out.write("Number of Optimal Alignments:")
    out.write(str(a.get_total_optimal_alignments()))
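From the parsing above, the driver expects a four-line input file: the flag ('g' or 'l'), the three scores (match, mismatch, indel), then the two sequences. A hypothetical test.txt:

g
1 -1 -2
ACGTAG
ACTTAG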
Example n. 28
0
def chipseq_genome():
    args = get_args()
    if args.o is None:
        args.o = str(pathlib.Path.cwd())

    # prep-dirs
    # subdirs = ['genome_mapping', 'count', 'bigWig', 'report', 'src']
    prj_path = prepare_project(args.o)

    # path_out
    #    |-genome_mapping
    #    |-bigWig
    #    |-macs2_output
    #    |-transposon_analysis
    #    |-src

    ## Alignment ##

    # control
    ctl_fqs = [f.name for f in args.c]
    if args.C is None:
        ctl_prefix = str_common([os.path.basename(f) for f in ctl_fqs])
        # rstrip() treats its argument as a set of characters to strip
        ctl_prefix = ctl_prefix.rstrip('rRep').rstrip('_.')
        args.C = ctl_prefix
    ctl_path = os.path.join(prj_path['genome_mapping'], args.C)
    ctl_bam_files, ext_ctl_bam_files = Alignment(
        ctl_fqs,
        ctl_path,
        smp_name=args.C,
        genome=args.g,
        spikein=None,
        index_ext=args.x,
        threads=args.threads,
        unique_only=True,
        n_map=1,
        aligner=args.aligner,
        align_to_rRNA=True,
        merge_rep=False,
        path_data=args.path_data,
        overwrite=args.overwrite).run()

    # treatment
    tre_fqs = [f.name for f in args.t]
    if args.T is None:
        tre_prefix = str_common([os.path.basename(f) for f in tre_fqs])
        tre_prefix = tre_prefix.rstrip('rRep').rstrip('_.')
        args.T = tre_prefix
    tre_path = os.path.join(prj_path['genome_mapping'], args.T)
    tre_bam_files, ext_tre_bam_files = Alignment(
        tre_fqs,
        tre_path,
        smp_name=args.T,
        genome=args.g,
        spikein=None,
        index_ext=args.x,
        threads=args.threads,
        unique_only=True,
        n_map=1,
        aligner=args.aligner,
        align_to_rRNA=True,
        merge_rep=False,
        path_data=args.path_data,
        overwrite=args.overwrite).run()

    # ## mapping stat ##
    # map_stat_path = prj_path['report']
    # map_stat_file = os.path.join(map_stat_path, 'mapping.stat')
    # ctl_map = map_stat(ctl_path)
    # tre_map = map_stat(tre_path)
    # df_map = pd.concat([ctl_map, tre_map], axis=0).reset_index()
    # df_map = df_map.sort_values(['index'])
    # df_map.to_csv(map_stat_file, sep='\t', header=True, index=False)

    ################
    ## call peaks ##
    ################
    ## for each replicates
    for i, tre_bam in enumerate(tre_bam_files):
        if i >= len(ctl_bam_files):
            i = 0  # the first one
        ctl_bam = ctl_bam_files[i]
        # output directory
        tre_bam_prefix = file_prefix(tre_bam)[0]
        tre_bam_path = os.path.join(prj_path['macs2_output'], tre_bam_prefix)
        p = Macs2(ip=tre_bam,
                  control=ctl_bam,
                  genome=args.g,
                  output=tre_bam_path,
                  prefix=tre_bam_prefix)
        # call peaks
        p.callpeak()
        p.bdgcmp(opt='ppois')
        p.bdgcmp(opt='FE')
        p.bdgcmp(opt='logLR')
        # annotate peaks
        p.broadpeak_annotation()

    ###################
    ## create bigWig ##
    ###################
    # map_bam_files = ctl_bam_files + tre_bam_files
    # bw_path = prj_path['bigWig']
    # for bam in map_bam_files:
    #     bam2bigwig(
    #         bam=bam,
    #         genome=args.g,
    #         path_out=bw_path,
    #         strandness=args.s,
    #         binsize=args.bin_size,
    #         overwrite=args.overwrite)

    #########################
    ## transposon analysis ##
    #########################
    if ext_tre_bam_files is None or ext_ctl_bam_files is None:
        logging.info('transposon analysis skipped')
    else:
        if isinstance(ctl_bam_files, list) and isinstance(tre_bam_files, list):
            # fetch the scale
            te_path = prj_path['transposon_analysis']
            for i_index, i in enumerate(ext_tre_bam_files):
                # genome mapping BAM
                ext_tre_bam = i[0]
                tre_bam = tre_bam_files[i_index]
                if i_index >= len(ext_ctl_bam_files):
                    i_index = 0
                ext_ctl_bam = ext_ctl_bam_files[i_index][0]
                ctl_bam = ctl_bam_files[i_index]
                # fetch the normalize scale
                tre_bam_prefix = file_prefix(tre_bam)[0]
                tre_bam_path = os.path.join(prj_path['macs2_output'],
                                            tre_bam_prefix)
                p = Macs2(ip=tre_bam,
                          control=ctl_bam,
                          genome=args.g,
                          output=tre_bam_path,
                          prefix=tre_bam_prefix)
                d = p.get_effect_size()  # ip_scale, ip_depth, input_scale, input_depth

                # bam to bigWig
                te_sub_path = os.path.join(te_path, tre_bam_prefix)
                bam2bigwig2(ext_tre_bam,
                            te_sub_path,
                            scale=d['ip_scale'],
                            overwrite=args.overwrite)
                bam2bigwig2(ext_ctl_bam,
                            te_sub_path,
                            scale=d['input_scale'],
                            overwrite=args.overwrite)

                # save scale to file
                s1 = os.path.join(te_sub_path, 'scale.pickle')
                s2 = os.path.join(te_sub_path, 'scale.lib')
                args_checker(d, s1)
                args_logger(d, s2)

                # create coverage plots
                pdf_out = os.path.join(te_sub_path,
                                       tre_bam_prefix + '.track_view.pdf')
                fasize = 'abc.fa'
                bigwig2track_single(ext_tre_bam, ext_ctl_bam, fasize, 'P5',
                                    pdf_out)
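str_common is assumed here to return the longest shared prefix of the FASTQ file names, which the rstrip calls then tidy into a sample name; a minimal sketch of such a helper (the original implementation is not shown):

import os

def str_common(names):
    # Longest common string prefix -- a guess at the helper's behaviour.
    return os.path.commonprefix(names)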
Example n. 29
0
import cv2
import sys
import numpy as np
import datetime
from alignment import Alignment
sys.path.append('../SSH')
from ssh_detector import SSHDetector

#short_max = 800
scales = [1200, 1600]
t = 2

detector = SSHDetector('../SSH/model/e2ef', 0)
alignment = Alignment('./model/3d_I5', 12)
out_filename = './out.png'

f = '../sample-images/t1.jpg'
if len(sys.argv) > 1:
  f = sys.argv[1]
img = cv2.imread(f)
im_shape = img.shape
print(im_shape)
target_size = scales[0]
max_size = scales[1]
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
if im_size_min > target_size or im_size_max > max_size:
  im_scale = float(target_size) / float(im_size_min)
  # prevent bigger axis from being more than max_size:
  if np.round(im_scale * im_size_max) > max_size:
      im_scale = float(max_size) / float(im_size_max)
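The script stops after computing im_scale; a typical next step would be to resize the image and run detection, roughly as follows (cv2.resize is standard OpenCV, while the detector.detect signature and return shape are assumptions about SSHDetector):

# Hypothetical continuation -- detect() API and its return value are assumed;
# this also assumes im_scale was set by the branch above.
img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale)
faces = detector.detect(img, threshold=0.5)  # assumed: (N, 5) array of boxes
print('%d faces detected' % faces.shape[0])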
Example n. 30
0
    def read_alignments(self,
                        input_dir,
                        file_format,
                        header_grep=None,
                        compression=None):
        """ Get list of alignment files from an input directory *.fa, *.fas and
        *.phy files only

        Stores in self.files """

        optioncheck(compression, [None, 'gz', 'bz2'])

        if file_format == 'fasta':
            extensions = ['fa', 'fas', 'fasta']

        elif file_format == 'phylip':
            extensions = ['phy']

        else:
            extensions = []

        if compression:
            extensions = ['.'.join([x, compression]) for x in extensions]

        files = fileIO.glob_by_extensions(input_dir, extensions)
        files.sort(key=SORT_KEY)
        self._input_files = files
        records = []

        pbar = setup_progressbar("Loading files",
                                 len(files),
                                 simple_progress=True)
        pbar.start()

        for i, f in enumerate(files):
            if compression is not None:
                with fileIO.TempFile() as tmpfile:
                    with fileIO.freader(f, compression) as reader, \
                            fileIO.fwriter(tmpfile) as writer:
                        for line in reader:
                            writer.write(line)
                    try:
                        record = Alignment(tmpfile, file_format, True)
                    except RuntimeError:
                        record = Alignment(tmpfile, file_format, False)

            else:
                try:
                    record = Alignment(f, file_format, True)
                except RuntimeError:
                    record = Alignment(f, file_format, False)

            if header_grep:
                try:
                    datatype = 'dna' if record.is_dna() else 'protein'

                    record = Alignment([(header_grep(x), y)
                                        for (x, y) in record.get_sequences()],
                                       datatype)

                except TypeError:
                    raise TypeError("Couldn't apply header_grep to header\n"
                                    "alignment number={}, name={}\n"
                                    "header_grep={}".format(
                                        i, fileIO.strip_extensions(f),
                                        header_grep))
                except RuntimeError:
                    print(
                        'RuntimeError occurred processing alignment number={}, name={}'
                        .format(i, fileIO.strip_extensions(f)))
                    raise

            record.name = fileIO.strip_extensions(f)
            records.append(record)
            pbar.update(i)
        pbar.finish()
        return records
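A typical invocation, assuming the method belongs to a collection-style class (called AlignmentCollection here purely for illustration) and that gzip-compressed FASTA files live under data/alignments:

collection = AlignmentCollection()  # hypothetical enclosing class
records = collection.read_alignments('data/alignments', 'fasta',
                                     compression='gz')
print('{} alignments loaded'.format(len(records)))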