def make_alignment(self, cfg, alignment):
        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(alignment, self)
        sub_path = os.path.join(cfg.phyml_path, self.name + '.phy')
        # Add it into the sub, so we keep it around
        self.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s", sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error(
                    "It looks like you have changed one or more of the "
                    "data_blocks in the configuration file, "
                    "so the new subset alignments "
                    "don't match the ones stored for this analysis. "
                    "You'll need to run the program with --force-restart")
                raise SubsetError
        else:
            # We need to write it
            sub_alignment.write(sub_path)
Example #2
    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError

            compare = lambda x, y: collections.Counter(x) == collections.Counter(y)

            if not compare(old_align.species, self.alignment.species):
                log.error("""Species names in alignment have changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError


        else:
            self.alignment.write(self.alignment_path)
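A note on the compare lambda above: collections.Counter turns each species list into a multiset, so the comparison ignores order but not duplicates. A minimal self-contained sketch (hypothetical species names):

import collections

def same_species(a, b):
    # Multiset equality: order-insensitive, duplicate-sensitive
    return collections.Counter(a) == collections.Counter(b)

print(same_species(['human', 'mouse'], ['mouse', 'human']))  # True
print(same_species(['human', 'mouse'], ['human', 'human']))  # False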
Example #3
    def permuted_copy(self, partition=None):
        """ Return a copy of the collection with all alignment columns permuted
        """
        def take(n, iterable):
            return [next(iterable) for _ in range(n)]

        if partition is None:
            partition = Partition([1] * len(self))

        index_tuples = partition.get_membership()

        alignments = []
        for ix in index_tuples:
            concat = Concatenation(self, ix)
            sites = concat.alignment.get_sites()
            random.shuffle(sites)
            d = dict(zip(concat.alignment.get_names(), [iter(x) for x in zip(*sites)]))
            new_seqs = [[(k, ''.join(take(l, d[k]))) for k in d] for l in concat.lengths]

            for seqs, datatype, name in zip(new_seqs, concat.datatypes, concat.names):
                alignment = Alignment(seqs, datatype)
                alignment.name = name
                alignments.append(alignment)

        return self.__class__(records=sorted(alignments, key=lambda x: SORT_KEY(x.name)))
Example #4
    def permuted_copy(self, partition=None):
        """ Return a copy of the collection with all alignment columns permuted
        """
        def take(n, iterable):
            return [next(iterable) for _ in range(n)]

        if partition is None:
            partition = Partition([1] * len(self))

        index_tuples = partition.get_membership()

        alignments = []
        for ix in index_tuples:
            concat = Concatenation(self, ix)
            sites = concat.alignment.get_sites()
            random.shuffle(sites)
            d = dict(
                zip(concat.alignment.get_names(),
                    [iter(x) for x in zip(*sites)]))
            new_seqs = [[(k, ''.join(take(l, d[k]))) for k in d]
                        for l in concat.lengths]

            for seqs, datatype, name in zip(new_seqs, concat.datatypes,
                                            concat.names):
                alignment = Alignment(seqs, datatype)
                alignment.name = name
                alignments.append(alignment)

        return self.__class__(
            records=sorted(alignments, key=lambda x: SORT_KEY(x.name)))
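The core trick in permuted_copy is transposing the sequences into columns, shuffling the columns, and transposing back, so the sites are permuted while each column stays internally aligned. A toy sketch with a hypothetical two-sequence alignment (plain dict instead of the Alignment class):

import random

seqs = {'A': 'ACGT', 'B': 'TGCA'}
sites = list(zip(*seqs.values()))   # columns: [('A', 'T'), ('C', 'G'), ...]
random.shuffle(sites)               # permute alignment columns
rows = [''.join(col) for col in zip(*sites)]
shuffled = dict(zip(seqs.keys(), rows))
print(shuffled)                     # e.g. {'A': 'GTCA', 'B': 'CAGT'}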
Example #5
def parse_input(target_sequence: str, input_fasta_file: str) -> None:
    """
    parse_input

    :param target_sequence: The target sequence as a string.
    :param input_fasta_file: The path to a FASTA file.
    """

    if not Helpers.valid_dna_sequence(target_sequence):
        raise click.UsageError(
            'The target sequence is not a valid DNA sequence.')

    input_sequence = Helpers.fasta_file_to_sequence(input_fasta_file)
    if not input_sequence:
        raise click.UsageError('Passed input file not in FASTA format.')

    optimal_alignment_finder = OptimalAlignmentFinder(target_sequence,
                                                      input_sequence)

    # If the results directory doesn't exist, create it.
    if not os.path.exists(RESULTS_DIRECTORY):
        os.makedirs(RESULTS_DIRECTORY)

    Alignment.save_alignments(f'{RESULTS_DIRECTORY}forward_alignments.json',
                              optimal_alignment_finder.forward_alignments)
    Histogram.save_histograms(optimal_alignment_finder.forward_alignments,
                              True)

    Alignment.save_alignments(
        f'{RESULTS_DIRECTORY}reverse_complement_alignments.json',
        optimal_alignment_finder.reverse_complement_alignments)
    Histogram.save_histograms(
        optimal_alignment_finder.reverse_complement_alignments, False)
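The exists()/makedirs() pair above has a small race: another process could create the directory between the check and the call. On Python 3.2+ the same effect is a single call; RESULTS_DIRECTORY is given a hypothetical value here, standing in for the example's own constant:

import os

RESULTS_DIRECTORY = 'results/'  # hypothetical stand-in value
os.makedirs(RESULTS_DIRECTORY, exist_ok=True)  # no-op if the directory already exists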
Example #6
    def make_alignment(self, cfg, alignment):
        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(alignment, self)
        sub_path = os.path.join(cfg.phylofiles_path, self.name + '.phy')
        # Add it into the sub, so we keep it around
        self.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s", sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error(
                    "It looks like you have changed one or more of the "
                    "data_blocks in the configuration file, "
                    "so the new subset alignments "
                    "don't match the ones stored for this analysis. "
                    "You'll need to run the program with --force-restart")
                raise SubsetError
        else:
            # We need to write it
            sub_alignment.write(sub_path)
Example #7
    def execute(self):

        # Alignment
        # TODO: choose mode automatically
        msa = Alignment(messages=self.messages,
                        output_dir=self.output_dir,
                        mode=self.mode,
                        multithread=self.multithread)
        #msa = Alignment(messages=self.messages, output_dir=self.output_dir, multithread=True)
        msa.execute()
        # exit()

        # Generate fields
        filepath_fields_info = os.path.join(self.output_dir,
                                            Alignment.FILENAME_FIELDS_INFO)
        self.fields, fid_list = self.generate_fields_by_fieldsinfo(
            filepath_fields_info)
        logging.debug("Number of keyword candidates: {}\nfid: {}".format(
            len(fid_list), fid_list))

        # Compute probabilities of observation constraints
        constraint = Constraint(messages=self.messages,
                                direction_list=self.direction_list,
                                fields=self.fields,
                                fid_list=fid_list,
                                output_dir=self.output_dir)

        pairs_p, pairs_size = constraint.compute_observation_probabilities()
        pairs_p_request, pairs_p_response = pairs_p
        pairs_size_request, pairs_size_response = pairs_size
        constraint.save_observation_probabilities(pairs_p_request,
                                                  pairs_size_request,
                                                  Constraint.TEST_TYPE_REQUEST)
        constraint.save_observation_probabilities(
            pairs_p_response, pairs_size_response,
            Constraint.TEST_TYPE_RESPONSE)

        # pairs_p_request, pairs_size_request = constraint.load_observation_probabilities(Constraint.TEST_TYPE_REQUEST)
        # pairs_p_response, pairs_size_response = constraint.load_observation_probabilities(Constraint.TEST_TYPE_RESPONSE)
        # print(pairs_p_request, pairs_size_request)
        # print(pairs_p_response, pairs_size_response)

        # Probabilistic inference
        pairs_p_all, pairs_size_all = self.merge_constraint_results(
            pairs_p_request, pairs_p_response, pairs_size_request,
            pairs_size_response)

        ffid_list = ["{0}-{0}".format(fid)
                     for fid in fid_list]  #only test same fid for both sides
        pi = ProbabilisticInference(pairs_p=pairs_p_request,
                                    pairs_size=pairs_size_request)
        fid_inferred = pi.execute(ffid_list)

        ## TODO: iterative
        ## TODO: format inference

        return fid_inferred
Example #8
 def get_results(self):
     if self.file_read_job is None:
         return self.results
     else:
         # self.results=read_internal_alignment(self.alignedfn,)
         alignment = Alignment()
         alignment.datatype = self.datatype
         alignment.read_filepath(self.alignedfn, file_format='FASTA')
         self.results = alignment
         return self.results
Example #9
    def __init__(self, gui, parent=None):
        """
        Establish the connection with the main gui, set some instance variables and initialize all
        flags to False.

        :param gui: main gui object
        """

        QtCore.QThread.__init__(self, parent)
        self.gui = gui

        # Create the alignment object. Alignment points are kept throughout the whole program
        # execution, even if the telescope driver or other configuration parameters are changed.
        self.al = Alignment(self.gui.configuration, debug=self.gui.configuration.alignment_debug)

        self.exiting = False

        self.output_channel_initialization_flag = False
        self.telescope_initialization_flag = False
        self.camera_initialization_flag = False
        self.new_tesselation_flag = False
        self.slew_to_alignment_point_flag = False
        self.perform_alignment_flag = False
        self.perform_autoalignment_flag = False
        self.slew_to_moon_limb_flag = False
        self.set_focus_area_flag = False
        self.goto_focus_area_flag = False
        self.slew_to_tile_and_record_flag = False
        self.move_to_selected_tile_flag = False
        self.escape_pressed_flag = False

        # Save the descriptor of standard output. Stdout might be redirected to a file and back
        # later.
        self.stdout_saved = sys.stdout

        # Initialize status variables.
        self.output_redirected = False
        self.telescope_connected = False
        self.camera_connected = False
        self.tesselation_created = False

        # Initialize some instance variables.
        self.active_tile_number = -1
        self.all_tiles_recorded = False
        self.protocol_file = None
        self.telescope = None
        self.camera = None
        self.date_time = None
        self.me = None
        self.tc = None
        self.repeat_from_here = None
        self.tile_indices_since_last_autoalign = None

        self.start()
Example #10
def read_internal_alignment(fn, file_format="FASTA", datatype=None, dirs_to_delete=(), temp_fs=None):
    alignment = Alignment()
    alignment.datatype = datatype
    alignment.read_filepath(fn, file_format=file_format)
    if len(alignment) >= 1:
        if dirs_to_delete:
            assert temp_fs
            for d in dirs_to_delete:
                time.sleep(1)  # TODO: not sure why this is here!
                temp_fs.remove_dir(d)
        return alignment
    else:
        raise ValueError("The alignment file has no sequences. SATe quits." % fn)
Example #11
    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)
        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("Alignment file has changed since previous run. You need to use the force-restart option.")
                raise AnalysisError

        else:
            self.alignment.write(self.alignment_path)
Example #12
 def build_expr(self, context, expr, filter=None, align=None):
     score_expr = LogitScore(expr)
     if align is not None:
         # we do not need add_filter because Alignment already handles it
         return Alignment(score_expr, align, filter=filter)
     else:
         return self.add_filter(ComparisonOp('>', score_expr, 0.5), filter)
Example #13
    def generate_xml_tree(self):
        """
        Try to parse xml, generate tree with xml tags and then cast it to mainAligment object and Alignment
        :return: exception when file has't got correct content
        """
        try:
            tree = et.parse(self.file)
            self.root = tree.getroot()
            self.blast_output = self.root[8]
            self.iteration = self.blast_output[0]
            self.iteration_hit = self.iteration[4]

            for i in self.iteration_hit:
                self.hits.append(i)

            for i in self.hits:
                h = []
                for j in i:
                    h.append(j)

                for hsp in h[5]:
                    procent = "{0:.2f}".format(
                        int(hsp[10].text) / int(hsp[13].text) * 100)
                    procent = float(procent)
                    self.aligns.append(
                        Alignment(h[2].text, hsp[1].text, procent,
                                  hsp[12].text, hsp[10].text, hsp[13].text,
                                  hsp[14].text, hsp[15].text, hsp[16].text))
                self.main_alignments.append(
                    MainAlignment(i[2].text, self.aligns))
                self.aligns = []
        except IndexError:
            # Unexpected XML layout: report the malformed file
            raise ValueError("Bad file.")
Example #14
    def __init__(self, line_string):
        self.type = self.TYPE_HEADER if line_string.startswith('@') \
                else self.TYPE_ALIGNMENT

        if self.type == self.TYPE_HEADER:
            self.fields = [line_string]
            return

        self.fields = line_string.split()
        pos, cigar = self.fields[3], self.fields[5]

        if cigar == '*':
            raise CigarUnavailableError

        md = next(filter(lambda field: field.startswith('MD:Z:'), self.fields))
        md = md.replace('MD:Z:', '')
        self.alignment = Alignment(pos, cigar, md)
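For reference, the field indices above follow the SAM layout: column 4 (index 3) is POS and column 6 (index 5) is CIGAR, with MD carried as an optional 'MD:Z:' tag. A toy record (hypothetical read) shows the extraction in isolation:

line = 'read1\t0\tchr1\t100\t60\t4M\t*\t0\t0\tACGT\tFFFF\tMD:Z:4'
fields = line.split()
pos, cigar = fields[3], fields[5]                      # '100', '4M'
md = next(f for f in fields if f.startswith('MD:Z:'))  # 'MD:Z:4'
md = md.replace('MD:Z:', '')                           # '4'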
Example #15
def find_single_unique(alns, bam, debug=False):
    """Extracts single unique alignment for indel detection
    If there is only one alignment reported by BWA-mem even when '-a' is turned on
    
    Args:
        alns: (list) Pysam AlignedRead objects of the same contig
        bam: Pysam bam handle
    Returns:
        Alignment object or None
    """
    primary_alns = [
        aln for aln in alns if not aln.is_unmapped and not aln.is_secondary
    ]
    if len(primary_alns) == 1:
        if primary_alns[0].mapq > 0:
            matched_and_insertion_len = sum(
                [a[1] for a in primary_alns[0].cigar if a[0] <= 1])
            if float(matched_and_insertion_len) / float(
                    primary_alns[0].rlen) < 0.95:
                if debug:
                    sys.stdout.write(
                        'best alignment less than 0.95 mapped:%s %s\n' %
                        (alns[0].qname, alns[0].cigarstring))
                return None

            else:
                edit_distance = effective_edit_distance(alns[0])
                if edit_distance is not None and float(edit_distance) / float(
                        primary_alns[0].inferred_length) > 0.1:
                    if debug:
                        sys.stdout.write(
                            'filter out single uniq alignment %s: edit distance %s - > 0.1 of contig len %d (%.01f)\n'
                            % (alns[0].qname, edit_distance,
                               primary_alns[0].inferred_length,
                               float(edit_distance) /
                               float(primary_alns[0].inferred_length)))
                    return None

        else:
            if debug:
                sys.stdout.write(
                    'filter out single uniq alignment %s: mapq = 0\n' %
                    primary_alns[0].qname)
            return None

        #ambiguous_NM = 5
        #for aln in alns:
        #if aln.is_secondary and \
        #not re.search('[HS]', aln.cigarstring) and\
        #re.match('\d+M', aln.cigarstring) and re.search('\d+M$', aln.cigarstring) and\
        #int(aln.opt('NM')) - int(primary_alns[0].opt('NM')) <= ambiguous_NM:
        #if debug:
        #sys.stdout.write('secondary alignments too similar %s\n' % primary_alns[0].qname)
        #return None

        return Alignment.from_alignedRead(primary_alns[0], bam)
    else:
        return None
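The 0.95 cutoff above sums the pysam cigar operations with codes 0 (M, match/mismatch) and 1 (I, insertion) and divides by the read length. In isolation, with a made-up cigar:

cigar = [(4, 10), (0, 80), (1, 5), (0, 5)]  # pysam-style (op, length): 4=S, 0=M, 1=I
read_len = 100
matched_and_insertion_len = sum(length for op, length in cigar if op <= 1)
print(matched_and_insertion_len / float(read_len))  # 0.9 -> fails the 0.95 cutoff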
Example #16
    def alignment(self):
        """Make self into an alignment, and return it.

        If all the sequences are the same length and type, then self,
        a sequenceList, could be an Alignment.  This method generates
        an Alignment instance, runs the Alignment method
        checkLengthsAndTypes(), and returns the Alignment.

        If you feed p4 a fasta sequence, it makes SequenceList object,
        and runs this method on it.  If it works then p4 puts the
        Alignment object in var.alignments, and if not it puts the
        SequenceList object in var.sequenceLists.

        It is possible that p4 might think that some short sequences
        are DNA when they are really protein.  In that case it will
        fail to make an alignment, because it will fail the types
        check.  So what you can do is something like this::

            sl = var.sequenceLists[0]
            for s in sl.sequences:
                s.dataType = 'protein'
            a = sl.alignment()

        """

        from alignment import Alignment
        a = Alignment()
        a.fName = self.fName
        import copy
        a.sequences = copy.deepcopy(self.sequences)  # self will be deleted
        a.fName = self.fName
        a.checkLengthsAndTypes()
        return a
Example #17
    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError

            compare = lambda x, y: collections.Counter(
                x) == collections.Counter(y)

            if not compare(old_align.species, self.alignment.species):
                log.error(
                    """Species names in alignment have changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError

        else:
            self.alignment.write(self.alignment_path)
Example #18
def main():
    args = args_init(vars(get_args()), align=True)  # save as dictionary

    # log.info('aaaaa')

    # args['align_to_te'] = True

    ## run alignment
    map_bam_list = Alignment(**args).run()
Example #19
def read_internal_alignment(fn,
                            file_format='FASTA',
                            datatype=None,
                            dirs_to_delete=(),
                            temp_fs=None):
    alignment = Alignment()
    alignment.datatype = datatype
    alignment.read_filepath(fn, file_format=file_format)
    if len(alignment) >= 1:
        if dirs_to_delete:
            assert (temp_fs)
            for d in dirs_to_delete:
                time.sleep(.1)  #TODO: not sure why this is here!
                temp_fs.remove_dir(d)
        return alignment
    else:
        raise ValueError(
            "The alignment file %s has no sequences. PASTA quits." % fn)
Example #20
def find_chimera(alns, bam, debug=False, check_haplotype=True):
    """Determine if given alignments are chimeric

    Args:
        alns: (List) List of Pysam AlignedRead objects
        bam: (AlignmentFile) Pysam handle to BAM file - for getting reference info
        debug: (Boolean) debug mode - will output debugging statements
        check_haplotype: (Boolean) whether to screen out alignments to references
                                   containing '_'
    """
    primary_alns = []
    secondary_alns = []
    for aln in alns:
        if re.search('[HS]', aln.cigarstring) and not aln.is_secondary:
            primary_alns.append(aln)
        else:
            secondary_alns.append(aln)

    if check_haplotype and len(primary_alns) > 1:
        replace_haplotype(primary_alns, secondary_alns, bam)

    if len(primary_alns) > 1:
        aligns = [Alignment.from_alignedRead(aln, bam) for aln in primary_alns]
        bad_aligns = [align for align in aligns if not align.is_valid()]
        if bad_aligns:
            if debug:
                for align in bad_aligns:
                    sys.stdout.write('bad alignment %s %s %s %s %s %s' %
                                     (align.query, align.qstart, align.qend,
                                      align.target, align.tstart, align.tend))
        else:
            valid_secondary_aligns = []
            if secondary_alns:
                secondary_aligns = [
                    Alignment.from_alignedRead(aln, bam)
                    for aln in secondary_alns
                ]
                valid_secondary_aligns = [
                    align for align in secondary_aligns if align.is_valid()
                ]

            return aligns, valid_secondary_aligns

    return None, None
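The primary/secondary split above keys on clipping: a CIGAR string containing S (soft clip) or H (hard clip) marks a partial alignment, which is exactly what a chimeric contig produces. A quick check with made-up CIGAR strings:

import re

for cig in ['100M', '30S70M', '60M40H']:
    print(cig, bool(re.search('[HS]', cig)))  # False, True, True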
Example #21
def find_chimera(alns, bam, debug=False, check_haplotype=True):
    """Determine if given alignments are chimeric

    Args:
        alns: (List) List of Pysam AlignedRead objects
        bam: (AlignmentFile) Pysam handle to BAM file - for getting reference info
        debug: (Boolean) debug mode - will output debugging statements
        check_haplotype: (Boolean) whether to screen out alignments to references
                                   containing '_'
    """
    primary_alns = []
    secondary_alns = []
    for aln in alns:
        if re.search('[HS]', aln.cigarstring) and not aln.is_secondary:
            primary_alns.append(aln)
        else:
            secondary_alns.append(aln)
    
    if check_haplotype and len(primary_alns) > 1:
        replace_haplotype(primary_alns, secondary_alns, bam)
        
    if len(primary_alns) > 1:
        aligns = [Alignment.from_alignedRead(aln, bam) for aln in primary_alns]
        bad_aligns = [align for align in aligns if not align.is_valid()]
        if bad_aligns:
            if debug:
                for align in bad_aligns:
                    sys.stdout.write('bad alignment %s %s %s %s %s %s' % (align.query,
                                                                          align.qstart,
                                                                          align.qend,
                                                                          align.target,
                                                                          align.tstart,
                                                                          align.tend))
        else:
            valid_secondary_aligns = []
            if secondary_alns:
                secondary_aligns = [Alignment.from_alignedRead(aln, bam) for aln in secondary_alns]
                valid_secondary_aligns = [align for align in secondary_aligns if align.is_valid()]
                
            return aligns, valid_secondary_aligns
        
    return None, None
Example #22
def solveAlignment(method, fileName):
    alignment = Alignment(fileName)  # Create the alignment object
    alignment.readFile()  # Read the file with the input data
    if method == '1':  # Brute force chosen
        # start = datetime.now()
        result, result1, result2 = alignment.bruteForceSolving()  # Solve
        alignment.printBruteForce(result, result1,
                                  result2)  # Print the results
        # print(datetime.now() - start)
    elif method == '2':
        start = datetime.now()
        matrix, moves, result, result1, result2 = alignment.dynamicSolving(
        )  # Solve
        alignment.printDynamic(matrix, moves, result, result1,
                               result2)  # Print the results
        # print(datetime.now() - start)
    else:
        error(
            "Error: check that you are using the correct parameters. \n Use [-h] for help."
        )
Example #23
    def make_alignment(self, cfg, alignment):
        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(alignment, self)
        sub_path = os.path.join(cfg.phylofiles_path, self.name + '.phy')
        # Add it into the sub, so we keep it around
        self.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s", sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error(self.FORCE_RESTART_MESSAGE)
                raise SubsetError
        else:
            # We need to write it
            sub_alignment.write(sub_path)
Example #24
    def calcUnconstrainedLogLikelihood1(self):
        """Calculate likelihood under the multinomial model.

        This calculates the unconstrained (multinomial) log like
        without regard to character partitions.  The result is placed
        in the data variable unconstrainedLogLikelihood.  If there is
        more than one partition, it makes a new temporary alignment
        and puts all the sequences in one part in that alignment.  So
        it ultimately only works on one data partition.  If there is
        more than one alignment, there is possibly more than one
        datatype, and so this method will refuse to do it.  Note that
        the unconstrained log like of the combined data is not the sum
        of the unconstrained log likes of the separate partitions.

        See also calcUnconstrainedLogLikelihood2

        """

        if len(self.alignments) > 1:
            gm = ["Data.calcUnconstrainedLogLikelihood()"]
            gm.append("This method is not implemented for more than one alignment.")
            raise P4Error(gm)
        if self.nParts == 1:  # no problem
            self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(self.parts[0].cPart)
        else:
            a = self.alignments[0]
            import copy

            newAlig = Alignment()
            newAlig.dataType = a.dataType
            newAlig.symbols = a.symbols
            newAlig.dim = a.dim
            newAlig.equates = a.equates
            newAlig.taxNames = a.taxNames
            for s in a.sequences:
                newAlig.sequences.append(copy.deepcopy(s))
            newAlig.checkLengthsAndTypes()
            newAlig._initParts()
            # newAlig.dump()
            self.unconstrainedLogLikelihood = pf.getUnconstrainedLogLike(newAlig.parts[0].cPart)
            del newAlig
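For reference, the multinomial (unconstrained) log-likelihood this method computes is conventionally written over site patterns; this is the standard textbook form, not code from p4. With n_p the count of site pattern p and N the total number of sites:

\ln L_{\mathrm{unc}} = \sum_{p} n_{p} \ln \frac{n_{p}}{N}, \qquad N = \sum_{p} n_{p}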
Example #25
    def simulate(self, partition, outdir, batchsize=1, background=False,
                 **kwargs):
        """
        Simulate a set of alignments from the parameters inferred on a partition
        :param partition:
        :param background: if True, return the asynchronous map_result immediately
        :return:
        """
        indices = partition.get_membership()
        self.add_lnl_partitions(partition, **kwargs)
        results = [self.lnl_cache[ix] for ix in indices]
        places = dict((j, i) for (i, j) in enumerate(
            rec.name for rec in self.collection.records))

        # Collect argument list
        args = [None] * len(self.collection)
        for result in results:
            for partition in result['partitions'].values():
                place = places[partition['name']]
                args[place] = (len(self.collection[place]),
                               model_translate(partition['model']),
                               partition['frequencies'], partition['alpha'],
                               result['ml_tree'], partition['rates']
                               if 'rates' in partition else None)

        # Distribute work
        msg = 'Simulating'
        client = get_client()
        if client is None:
            map_result = sequential_map(client, tasks.simulate_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.simulate_task, args, msg,
                                      batchsize, background)
            if background:
                return map_result

        # Process results
        for i, result in enumerate(map_result):
            orig = self.collection[i]
            simseqs = gapmask(result, orig.get_sequences())
            al = Alignment(simseqs, 'protein' if orig.is_protein() else 'dna')
            outfile = os.path.join(outdir, orig.name + '.phy')
            al.write_alignment(outfile, 'phylip', True)
Example #26
    def make_alignment(self, cfg, alignment):
        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(alignment, self)

        sub_path = os.path.join(cfg.phylofiles_path, self.subset_id + '.phy')
        # Add it into the sub, so we keep it around
        self.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s" % sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error(self.FORCE_RESTART_MESSAGE)
                raise SubsetError
        else:
            # We need to write it
            sub_alignment.write(sub_path)
Example #27
    def alignment(self):
        """Make self into an alignment, and return it.

        If all the sequences are the same length and type, then self,
        a sequenceList, could be an Alignment.  This method generates
        an Alignment instance, runs the Alignment method
        checkLengthsAndTypes(), and returns the Alignment.

        If you feed p4 a fasta sequence, it makes SequenceList object,
        and runs this method on it.  If it works then p4 puts the
        Alignment object in var.alignments, and if not it puts the
        SequenceList object in var.sequenceLists.

        It is possible that p4 might think that some short sequences
        are DNA when they are really protein.  In that case it will
        fail to make an alignment, because it will fail the types
        check.  So what you can do is something like this::

            sl = var.sequenceLists[0]
            for s in sl.sequences:
                s.dataType = 'protein'
            a = sl.alignment()

        """

        from alignment import Alignment

        a = Alignment()
        a.fName = self.fName
        import copy

        a.sequences = copy.deepcopy(self.sequences)  # self will be deleted
        a.fName = self.fName
        a.checkLengthsAndTypes()
        return a
Example #28
    def simulate(self, partition, outdir, batchsize=1, background=False,
                 **kwargs):
        """
        Simulate a set of alignments from the parameters inferred on a partition
        :param partition:
        :param background: if True, return the asynchronous map_result immediately
        :return:
        """
        indices = partition.get_membership()
        self.add_lnl_partitions(partition, **kwargs)
        results = [self.lnl_cache[ix] for ix in indices]
        places = dict((j,i) for (i,j) in enumerate(rec.name for rec in self.collection.records))

        # Collect argument list
        args = [None] * len(self.collection)
        for result in results:
            for partition in result['partitions'].values():
                place = places[partition['name']]
                args[place] = (len(self.collection[place]),
                               model_translate(partition['model']),
                               partition['frequencies'],
                               partition['alpha'],
                               result['ml_tree'],
                               partition['rates'] if 'rates' in partition else None)

        # Distribute work
        msg = 'Simulating'
        client = get_client()
        if client is None:
            map_result = sequential_map(client, tasks.simulate_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.simulate_task, args, msg, batchsize, background)
            if background:
                return map_result

        # Process results
        for i, result in enumerate(map_result):
            orig = self.collection[i]
            simseqs = gapmask(result, orig.get_sequences())
            al = Alignment(simseqs, 'protein' if orig.is_protein() else 'dna')
            outfile = os.path.join(outdir, orig.name + '.phy')
            al.write_alignment(outfile, 'phylip', True)
Example #29
    def map_aligns(self,
                   bam,
                   query_fasta,
                   genome_fasta,
                   accessory_known_features=None,
                   find_events=True,
                   max_diff=1):
        mappings = defaultdict(list)
        junc_adjs = []
        events = []
        for query, group in groupby(bam.fetch(until_eof=True),
                                    lambda aln: aln.query_name):
            print 'processing', query
            aligns = []
            for aln in list(group):
                if not aln.is_unmapped:
                    aligns.append(Alignment.from_alignedRead(aln, bam))

            if not aligns:
                continue

            query_seq = query_fasta.fetch(query)

            for align in aligns:
                if not align.has_canonical_target() or align.blocks is None:
                    continue
                block_matches = self.map_align(align)
                if block_matches:
                    tid = self.pick_best_mapping(block_matches, align)
                    if tid is not None:
                        transcript = self.transcripts_dict[tid]
                        olap = self.overlap(align, transcript)
                        mappings[query].append(
                            (transcript.gene, transcript.id, olap))

                        junc_adjs.extend(
                            self.collect_junctions(align, transcript,
                                                   block_matches[tid]))

                        if find_events:
                            events.extend(
                                find_novel_junctions(block_matches[tid],
                                                     align,
                                                     transcript,
                                                     query_seq,
                                                     self.genome_fasta,
                                                     accessory_known_features=
                                                     accessory_known_features,
                                                     max_diff=max_diff))

        return mappings, junc_adjs, events
Example #30
def te_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Mapping reads to genome
    control or treatment
    args dict, the arguments of pipeline
    check index
    1. rRNA
    2. genome
    3. spike-in-rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    te_align_path = project_path['transposon']

    args['extra_index'] = None  # pre-build

    # ## qc-report
    # qc_path = os.path.join(te_align_path['report'], 'qc')
    # QC_reporter(fq1_files, qc_path).run() ## skip, run in gene_aligner

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = te_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = True

    # extra small genome
    small_genome = args['small_genome']
    args['small_genome'] = True

    ## run alignment
    map_bam_list = Alignment(**args).run()
    map_bam = [item for sublist in map_bam_list for item in sublist]

    # create bigWig files
    # for bam in map_bam:
    #     bam2bigwig(
    #         bam=bam,
    #         genome=args['genome'],
    #         path_out=te_align_path['bigWig'],
    #         strandness=args['s'],
    #         binsize=args['bin_size'],
    #         overwrite=args['overwrite'])

    ## return
    args['small_genome'] = small_genome

    return map_bam
Example #31
def find_single_unique(alns, bam, debug=False):
    """Extracts single unique alignment for indel detection
    If there is only one alignment reported by BWA-mem even when '-a' is turned on
    
    Args:
        alns: (list) Pysam AlignedRead objects of the same contig
        bam: Pysam bam handle
    Returns:
        Alignment object or None
    """
    primary_alns = [aln for aln in alns if not aln.is_unmapped and not aln.is_secondary]
    if len(primary_alns) == 1:
        if primary_alns[0].mapq > 0:            
            matched_and_insertion_len = sum([a[1] for a in primary_alns[0].cigar if a[0] <= 1])
            if float(matched_and_insertion_len) / float(primary_alns[0].rlen) < 0.95:
                if debug:
                    sys.stdout.write('best alignment less than 0.95 mapped:%s %s\n' % (alns[0].qname, alns[0].cigarstring))
                return None
        
            else:
                edit_distance = effective_edit_distance(alns[0])
                if edit_distance is not None and float(edit_distance)/float(primary_alns[0].inferred_length) > 0.1:
                    if debug:
                        sys.stdout.write('filter out single uniq alignment %s: edit distance %s - > 0.1 of contig len %d (%.01f)\n' % (alns[0].qname,
                                                                                                                                       edit_distance,
                                                                                                                                       primary_alns[0].inferred_length,
                                                                                                                                       float(edit_distance)/float(primary_alns[0].inferred_length)
                                                                                                                                       ))
                    return None
                        
        else:
            if debug:
                sys.stdout.write('filter out single uniq alignment %s: mapq = 0\n' % primary_alns[0].qname)
            return None
            
        #ambiguous_NM = 5
        #for aln in alns:
            #if aln.is_secondary and \
               #not re.search('[HS]', aln.cigarstring) and\
               #re.match('\d+M', aln.cigarstring) and re.search('\d+M$', aln.cigarstring) and\
               #int(aln.opt('NM')) - int(primary_alns[0].opt('NM')) <= ambiguous_NM:
                #if debug:
                    #sys.stdout.write('secondary alignments too similar %s\n' % primary_alns[0].qname)
                #return None
        
        return Alignment.from_alignedRead(primary_alns[0], bam) 
    else:
        return None
Example #32
def gene_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Mapping reads to genome
    control or treatment
    args dict, the arguments of pipeline
    check index
    1. rRNA
    2. genome
    3. spike-in-rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    gene_align_path = project_path['gene']

    ## qc-report
    qc_path = os.path.join(gene_align_path['report'], 'qc')
    # QC_reporter(fq1_files, qc_path).run()

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = gene_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = False

    ## run alignment
    map_bam_list = Alignment(**args).run()

    ## filt map_genome
    map_bam = []
    for i in map_bam_list:
        for k in i:
            if k.endswith('map_' + args['genome'] + '.bam'):
                map_bam.append(k)

    # # create bigWig files
    # for bam in map_bam:
    #     bam2bigwig(
    #         bam=bam,
    #         genome=args['genome'],
    #         path_out=gene_align_path['bigWig'],
    #         strandness=args['s'],
    #         binsize=args['bin_size'],
    #         overwrite=args['overwrite'])

    return map_bam
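The nested loop that picks out the genome BAMs above collapses to a single comprehension; the file names here are hypothetical:

map_bam_list = [['a.map_hg38.bam', 'a.map_rRNA.bam'], ['b.map_hg38.bam']]
genome = 'hg38'
map_bam = [k for i in map_bam_list for k in i
           if k.endswith('map_' + genome + '.bam')]
print(map_bam)  # ['a.map_hg38.bam', 'b.map_hg38.bam']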
Example #33
 def em_step(self, iteration):
     ffile = open(self.ffilename)
     efile = open(self.efilename)
     afile = open(self.afilename)
     alignments = Alignment.reader_pharaoh(ffile, efile, afile)
     dirname = os.path.join(self.outputdir,
                            'iter_%s' % str(iteration + 1).rjust(3, '0'))
     os.mkdir(dirname)
     if logger.level >= 1:
         logger.writeln('\niteration %s' % (iteration + 1))
     likelihood = 0
     starttime = time.time()
     for i, alignment in enumerate(alignments, 1):
         if i % FLAGS.emtrain_log_interval == 0:
             logger.writeln('%s sentences at %s secs/sent' %
                            (i, (time.time() - starttime) / i))
             starttime = time.time()
         extractor = Extractor(
             maxabslen=100000,
             maxlen=10000,
             minhole=1,
             maxvars=100000,
             lexical_weighter=self.lexical_weighter,
             forbid_adjacent=self.forbid_adjacent,
             maximize_derivation=self.maximize_derivation,
             require_aligned_terminal=self.require_aligned_terminal)
         hg = extractor.extract_hypergraph(alignment)
         if hg is None:
             continue
         # compute expected counts
         self.compute_expected_counts(hg)
         likelihood += hg.root.inside
         treefilename = os.path.join(dirname,
                                     'tree_%s' % str(i).rjust(8, '0'))
         self.write_viterbi_tree(hg, treefilename)
         #for edge in hg.edges():
         #    logger.writeln('%s %s' % (self.counter.get_prob(edge.rule),
         #                              edge.rule))
     if logger.level >= 1:
         logger.writeln('likelihood: %s' % likelihood)
     if logger.level >= 1:
         logger.writeln('normalizing...')
     self.counter.normalize_vbdp(self.alpha, self.threshold)
     if logger.level >= 1:
         logger.writeln('prob table size: %s' % len(self.counter.prob))
Example #34
def process_calc(bAuthenticate):
    # connect to MongoDB
    client = MongoClient()
    db = client[database]
    # DB authentication if required
    if bAuthenticate:
        bLoggedIn = db.authenticate(username, password, source=source_database)
    else:
        bLoggedIn = True
    if bLoggedIn:
        logger.info("Authenticated")
        pd = db.Project.find_one({"project_code":"MFW001_0-010 Metro Paris-Ligne 15_T2A"})
        if pd:
            logger.info("Project %s found", pd["project_name"])
            p = Project(db, pd)
            p.load()
            found_domains = Domain.find(db, {"project_id": p._id})
            for dom in found_domains:
                d = Domain(db, dom)
                d.load()
                asets = db.AlignmentSet.find({"domain_id": d._id})
                for aset in asets:
                    a_set = AlignmentSet(db, aset)
                    a_set.load()
                    #sCode = a_set.item["code"]
                    als = Alignment.find(db, {"alignment_set_id": a_set._id}).sort("PK", 1)
                    cnt = 0.
                    cnt_tot = als.count()
                    for al in als:
                        a = Alignment(db, al)
                        a.setProject(p.item)
                        a.load()
                        cnt += 1.

                        sys.stdout.write("\r{:5s} pk= {:.0f} progress= {:.0%}".format(a_set.item["code"], a.item["PK"], cnt / cnt_tot))
                        sys.stdout.flush()
                        a.perform_calc(str(datetime.now()))
                        
    else:
        logger.error("Authentication failed")
Example #35
    def map_aligns(self, bam, query_fasta, genome_fasta, accessory_known_features=None, find_events=True,
                   max_diff=1):
        mappings = defaultdict(list)
        junc_adjs = []
        events = []
        for query, group in groupby(bam.fetch(until_eof=True), lambda aln: aln.query_name):
            print 'processing', query
            aligns = []
            for aln in list(group):
                if not aln.is_unmapped:
                    aligns.append(Alignment.from_alignedRead(aln, bam))

            if not aligns:
                continue

            query_seq = query_fasta.fetch(query)

            for align in aligns:
                if not align.has_canonical_target() or align.blocks is None:
                    continue
                block_matches = self.map_align(align)
                if block_matches:
                    tid = self.pick_best_mapping(block_matches, align)
                    if tid is not None:
                        transcript = self.transcripts_dict[tid]
                        olap = self.overlap(align, transcript)
                        mappings[query].append((transcript.gene, transcript.id, olap))

                        junc_adjs.extend(self.collect_junctions(align, transcript, block_matches[tid]))

                        if find_events:
                            events.extend(find_novel_junctions(block_matches[tid],
                                                               align,
                                                               transcript,
                                                               query_seq,
                                                               self.genome_fasta,
                                                               accessory_known_features=accessory_known_features,
                                                               max_diff=max_diff))

        return mappings, junc_adjs, events
Example #36
    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error(
                    "Alignment file has changed since previous run. You need to use the force-restart option."
                )
                raise AnalysisError

        else:
            self.alignment.write(self.alignment_path)
Example #37
 def em_step(self, iteration):
     ffile = open(self.ffilename)
     efile = open(self.efilename)
     afile = open(self.afilename)
     alignments = Alignment.reader_pharaoh(ffile, efile, afile)
     percent_counter = PercentCounter(total=self.corpus_size)
     dirname = os.path.join(self.outputdir,
                            'iter_%s' % str(iteration + 1).rjust(3, '0'))
     os.mkdir(dirname)
     if logger.level >= 1:
         logger.writeln('\niteration %s' % (iteration + 1))
     likelihood = 0
     for i, alignment in enumerate(alignments):
         percent_counter.print_percent(i)
         # if logger.level >= 1:
         #     logger.writeln()
         #     logger.writeln('>>> sentence_pair_%s' % i)
         extractor = Extractor(lexical_weighter=self.lexical_weighter,
                               maximize_derivation=self.maximize_derivation)
         hg = extractor.extract_hypergraph(alignment)
         if hg is None:
             continue
         # compute expected counts
         self.compute_expected_counts(hg)
         likelihood += hg.root.inside
         treefilename = os.path.join(dirname,
                                     'tree_%s' % str(i + 1).rjust(8, '0'))
         self.write_viterbi_tree(hg, treefilename)
         #for edge in hg.edges():
         #    logger.writeln('%s %s' % (self.counter.get_prob(edge.rule),
         #                              edge.rule))
     if logger.level >= 1:
         logger.writeln('likelihood: %s' % likelihood)
     if logger.level >= 1:
         logger.writeln('normalizing...')
     self.counter.normalize_vbdp(self.alpha, self.threshold)
     if logger.level >= 1:
         logger.writeln('prob table size: %s' % len(self.counter.prob))
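Both em_step variants name their per-iteration output directories with str.rjust zero-padding; str.zfill is the equivalent shorthand:

for iteration in range(2):
    print('iter_%s' % str(iteration + 1).rjust(3, '0'))  # iter_001, iter_002
    print('iter_%s' % str(iteration + 1).zfill(3))       # same result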
Example #38
def extra_aligner(fq1_files, smp_name, args, fq2_files=None):
    """Mapping reads to genome
    control or treatment
    args dict, the arguments of pipeline
    check index
    1. rRNA
    2. genome
    3. spike-in-rRNA
    4. spike-in
    """
    project_path = init_rnaseq_project(args['path_out'], analysis_type=1)
    extra_align_path = project_path['extra']

    ## qc-report
    qc_path = os.path.join(extra_align_path['report'], 'qc')
    # QC_reporter(fq1_files, qc_path).run()

    ## update args
    args['fq1'] = fq1_files
    args['fq2'] = fq2_files
    args['path_out'] = extra_align_path['mapping']
    args['smp_name'] = smp_name
    args['align_to_te'] = False

    # extra small genome, for STAR
    small_genome = args['small_genome']
    args['small_genome'] = True

    ## run alignment
    map_bam = Alignment(**args).run()

    ## return
    args['small_genome'] = small_genome

    ## return
    return map_bam
Example #39
fh = logging.handlers.RotatingFileHandler('export_pk.log', maxBytes=5000000, backupCount=5)
fh.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)
# reading config file
sCFGName = 'smt.cfg'
smtConfig = ConfigParser.RawConfigParser()
smtConfig.read(sCFGName)
# setup DB parameters
host = smtConfig.get('MONGODB', 'host')
database = smtConfig.get('MONGODB', 'database')
source_database = smtConfig.get('MONGODB', 'source_database')
username = smtConfig.get('MONGODB', 'username')
password = smtConfig.get('MONGODB', 'password')
# connect to MongoDB
client = MongoClient()
db = client[database]
# DB authentication
db.authenticate(username, password, source=source_database)
# Search for project_code = "MFW001_0-010 Metro Paris-Ligne 15_T2A"
pd = db.Project.find_one({"project_code": "MFW001_0-010 Metro Paris-Ligne 15_T2A"})
p = Project(db, pd)
p.load()
found_domains = Domain.find(db, {"project_id": p._id})
for dom in found_domains:
    d = Domain(db, dom)
    d.load()
    # Export Alignments: db, domain_id, csvPk
    bDone = Alignment.export_data_by_pk(db, d._id, "../data/query_pk.csv", "../data/out_pk.csv")
    print bDone
Example #40
    def read_alignments(self, input_dir, file_format, header_grep=None, compression=None):
        """ Get list of alignment files from an input directory *.fa, *.fas and
        *.phy files only

        Stores in self.files """

        optioncheck(compression, [None, 'gz', 'bz2'])

        if file_format == 'fasta':
            extensions = ['fa', 'fas', 'fasta']

        elif file_format == 'phylip':
            extensions = ['phy']

        else:
            extensions = []

        if compression:
            extensions = ['.'.join([x, compression]) for x in extensions]

        files = fileIO.glob_by_extensions(input_dir, extensions)
        files.sort(key=SORT_KEY)
        self._input_files = files
        records = []

        pbar = setup_progressbar("Loading files", len(files), simple_progress=True)
        pbar.start()

        for i, f in enumerate(files):
            if compression is not None:
                with fileIO.TempFile() as tmpfile:
                    with fileIO.freader(f, compression) as reader, fileIO.fwriter(tmpfile) as writer:
                        for line in reader:
                            writer.write(line)
                    try:
                        record = Alignment(tmpfile, file_format, True)
                    except RuntimeError:
                        record = Alignment(tmpfile, file_format, False)

            else:
                try:
                    record = Alignment(f, file_format, True)
                except RuntimeError:
                    record = Alignment(f, file_format, False)

            if header_grep:
                try:
                    datatype = 'dna' if record.is_dna() else 'protein'

                    record = Alignment([(header_grep(x), y) for (x, y) in record.get_sequences()], datatype)

                except TypeError:
                    raise TypeError("Couldn't apply header_grep to header\n"
                                    "alignment number={}, name={}\n"
                                    "header_grep={}".format(i, fileIO.strip_extensions(f), header_grep))
                except RuntimeError:
                    print('RuntimeError occurred processing alignment number={}, name={}'
                          .format(i, fileIO.strip_extensions(f)))
                    raise

            record.name = (fileIO.strip_extensions(f))
            records.append(record)
            pbar.update(i)
        pbar.finish()
        return records
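The extension handling in read_alignments just appends the compression suffix before globbing. A plain-glob approximation of the project's fileIO.glob_by_extensions helper (directory name hypothetical):

import glob
import os

extensions = ['fa', 'fas', 'fasta']
compression = 'gz'
if compression:
    extensions = ['.'.join([x, compression]) for x in extensions]  # 'fa.gz', ...
files = sorted(f for ext in extensions
               for f in glob.glob(os.path.join('alignments', '*.' + ext)))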
Example #41
class Analysis(object):
    """Performs the analysis and collects the results"""
    def __init__(self, cfg, rpt, 
                 force_restart=False, 
                 save_phyml=False,
                 threads=-1):
        cfg.validate()

        self.cfg = cfg
        self.rpt = rpt
        self.threads = threads
        self.save_phyml = save_phyml
        self.results = results.AnalysisResults()

        log.info("Beginning Analysis")
        if force_restart:
            # Remove everything
            if os.path.exists(self.cfg.output_path):
                log.warning("Deleting all previous workings in '%s'", 
                            self.cfg.output_path)
                shutil.rmtree(self.cfg.output_path)
        else:
            # Just remove the schemes folder
            if os.path.exists(self.cfg.schemes_path):
                log.info("Removing Schemes in '%s' (they will be "
                         "recalculated from existing subset data)",
                         self.cfg.schemes_path)
                shutil.rmtree(self.cfg.schemes_path)

        #check for old analyses to see if we can use the old data
        self.cfg.check_for_old_config()

        # Make some folders for the analysis
        self.cfg.make_output_folders()
        self.make_alignment(cfg.alignment_path)

        self.make_tree(cfg.user_tree_topology_path)
        self.subsets_analysed_set = set() #a counter for user info
        self.subsets_analysed = 0 #a counter for user info
        self.total_subset_num = None
        self.schemes_analysed = 0 #a counter for user info
        self.total_scheme_num = None

    def analyse(self):
        self.do_analysis()
        self.results.finalise()
        self.report()
        return self.results

    def report(self):
        best = [
            ("Best scheme according to AIC", self.results.best_aic),
            ("Best scheme according to AICc", self.results.best_aicc),
            ("Best scheme according to BIC", self.results.best_bic),
        ]
        self.rpt.write_best_schemes(best)
        self.rpt.write_all_schemes(self.results)

    def make_alignment(self, source_alignment_path):
        # Make the alignment 
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("Alignment file has changed since previous run. "
                          "You need to use the force-restart option.")
                raise AnalysisError

        else:
            self.alignment.write(self.alignment_path)
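    # make_alignment implements a "write once, verify on re-run" guard. The
    # same idea as a generic sketch (hypothetical helper, not in the source):
    #
    #     def write_or_verify(path, content):
    #         if os.path.exists(path):
    #             if open(path).read() != content:
    #                 raise ValueError('%s changed since previous run' % path)
    #         else:
    #             open(path, 'w').write(content)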

    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(self.alignment, 
                                                  subset_with_everything)
        self.filtered_alignment_path = os.path.join(self.cfg.start_tree_path,
                                                    'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = phyml.make_tree_path(self.filtered_alignment_path)
        if not os.path.exists(tree_path):
            # If we have a user tree, use that; otherwise create a topology
            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path,
                                             'user_topology.phy')
                phyml.dupfile(user_path, topology_path)
            else:
                topology_path = phyml.make_topology(self.filtered_alignment_path, self.cfg.datatype)

            # Now estimate branch lengths
            if self.cfg.datatype == "DNA":
                tree_path = phyml.make_branch_lengths(self.filtered_alignment_path, topology_path)
            elif self.cfg.datatype == "protein":
                tree_path = phyml.make_branch_lengths_protein(self.filtered_alignment_path, topology_path)
                
        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s", self.tree_path) 


    def analyse_subset(self, sub, models):
        """Analyse the subset using the models given
        This is the core place where everything comes together
        The results are placed into subset.result
        """

        log.debug("About to analyse %s using models %s", sub, ", ".join(list(models)))

        # keep people informed about what's going on
        # if we don't know the total subset number, we can usually get it like this
        if self.total_subset_num is None:
            self.total_subset_num = len(sub._cache)
        old_num_analysed = self.subsets_analysed
        self.subsets_analysed_set.add(sub.name)
        self.subsets_analysed = len(self.subsets_analysed_set)
        if self.subsets_analysed > old_num_analysed:  # we've just analysed a subset we haven't seen yet
            percent_done = float(self.subsets_analysed) * 100.0 / float(self.total_subset_num)
            log.info("Analysing subset %d/%d: %.2f%% done" % (
                self.subsets_analysed, self.total_subset_num, percent_done))

        subset_cache_path = os.path.join(self.cfg.subsets_path, sub.name + '.bin')
        # We might have already saved a bunch of results, try there first
        if not sub.results:
            log.debug("Reading in cached data from the subsets file")
            sub.read_cache(subset_cache_path)

        # First, see if we've already got the results loaded. Then we can
        # shortcut all the other checks
        models_done = set(sub.results.keys())
        log.debug("These models have already been done: %s", models_done)
        models_required = set(models)
        models_to_do = models_required - models_done
        log.debug("Which leaves these models still to analyse: %s", models_to_do)

        # Empty set means we're done
        if not models_to_do:
            log.debug("All models already done, so using just the cached results for subset %s", sub)
            #if models_done!=set(models): #redo model selection if we have different models
            sub.model_selection(self.cfg.model_selection, self.cfg.models)        
            return


        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(self.alignment, sub)
        sub_path = os.path.join(self.cfg.phyml_path, sub.name + '.phy')
        # Add it into the sub, so we keep it around
        sub.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s", sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error("It looks like you have changed one or more of the"
                        "data_blocks in the configuration file, "
                        "so the new subset alignments"
                        " don't match the ones stored for this analysis."
                        "You'll need to run the program with --force-restart")
                raise AnalysisError
        else:
            # We need to write it
            sub_alignment.write(sub_path)

        # Try and read in some previous analyses
        log.debug("Checking for old results in the phyml folder")
        self.parse_results(sub, models_to_do)
        if not models_to_do:
            #if models_done!=set(models): #redo model selection if we have different models
            sub.model_selection(self.cfg.model_selection, self.cfg.models)        
            return

        # What is left, we actually have to analyse...
        tasks = []

        # for efficiency, we rank the models by their difficulty - most difficult first
        difficulty = []
        for m in models_to_do:
            difficulty.append(get_model_difficulty(m))

        # hat tip to http://scienceoss.com/sort-one-list-by-another-list/
        difficulty_and_m = zip(difficulty, models_to_do)
        difficulty_and_m.sort(reverse=True)
        sorted_difficulty, sorted_models_to_do = zip(*difficulty_and_m)
        log.debug("About to analyse these models, in this order: %s", sorted_models_to_do)
        for m in sorted_models_to_do:
            #a_path, out_path = phyml.make_analysis_path(self.cfg.phyml_path, sub.name, m)
            tasks.append((phyml.analyse, 
                          (m, sub_path, self.tree_path, self.cfg.branchlengths)))

        if self.threads == 1:
            self.run_models_concurrent(tasks)
        else:
            self.run_models_threaded(tasks)

        # Now parse the models we've just done
        self.parse_results(sub, models_to_do)

        # This should be empty NOW!
        if models_to_do:
            log.error("Failed to run models %s; not sure why", 
                      ", ".join(list(models_to_do)))
            raise AnalysisError

        # Now we have analysed all models for this subset, we do model selection
        # but ONLY on the models specified in the cfg file.
        sub.model_selection(self.cfg.model_selection, self.cfg.models)        
        
        # If we made it to here, we should write out the new summary
        self.rpt.write_subset_summary(sub)
        # We also need to update this
        sub.write_cache(subset_cache_path)
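    # The difficulty ranking in analyse_subset zips two lists and sorts them;
    # the same ordering is available directly from sorted() with a key
    # (equivalent sketch):
    #
    #     sorted_models_to_do = sorted(models_to_do,
    #                                  key=get_model_difficulty, reverse=True)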

    def parse_results(self, sub, models_to_do):
        """Read in the results and parse them"""
        models_done = []
        for m in list(models_to_do):
            # sub.alignment_path
            stats_path, tree_path = phyml.make_output_path(sub.alignment_path, m)
            if os.path.exists(stats_path):
                sub_output = open(stats_path, 'rb').read()
                # Annotate with the parameters of the model
                try:
                    result = phyml.parse(sub_output)
                    sub.add_model_result(m, result)
                    # Remove the current model from remaining ones
                    models_to_do.remove(m)
                    
                    # Just used for below
                    models_done.append(m)
                    if self.save_phyml:
                        pass
                    else:
                        os.remove(stats_path)
                        os.remove(tree_path)

                except phyml.PhymlError:
                    log.warning("Failed loading parse output from %s."
                              "Output maybe corrupted. I'll run it again.",
                              stats_path)

        if models_done:
            log.debug("Loaded analysis for %s, models %s", sub, ", ".join(models_done))

    def run_models_concurrent(self, tasks):
        for func, args in tasks:
            func(*args)

    def run_models_threaded(self, tasks):
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_scheme(self, sch, models):
        self.schemes_analysed = self.schemes_analysed + 1        
        log.info("Analysing scheme %d/%d" %(self.schemes_analysed, self.total_scheme_num))
        for sub in sch:
            self.analyse_subset(sub, models)
 
        # AIC needs the number of sequences 
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq, self.cfg.branchlengths)
        self.results.add_scheme_result(result)

        # TODO: should put all paths into config. Then reporter should decide
        # whether to create stuff
        fname = os.path.join(self.cfg.schemes_path, sch.name+'.txt')
        self.rpt.write_scheme_summary(result, open(fname, 'w'))

        return result
Example #42
0
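# This fragment starts mid-script; a plausible setup for the undefined names
# is sketched below (`fh` is presumably a logging.FileHandler, and the log
# file name is a guess; Project, Domain and Alignment come from the project's
# own modules, imports not shown):
import logging
import ConfigParser
from pymongo import MongoClient

logger = logging.getLogger('smt')
fh = logging.FileHandler('smt.log')  # assumed handler target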
formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)
# reading config file
sCFGName = 'smt.cfg'
smtConfig = ConfigParser.RawConfigParser()
smtConfig.read(sCFGName)
# setup DB parameter
host = smtConfig.get('MONGODB','host')
database = smtConfig.get('MONGODB','database')
source_database = smtConfig.get('MONGODB','source_database')
username = smtConfig.get('MONGODB','username')
password = smtConfig.get('MONGODB','password')
# connect to MongoDB
client = MongoClient()
db = client[database]
# DB authentication
db.authenticate(username,password,source=source_database)
# Search for project_code = "MFW001_0-010 Metro Paris-Ligne 15_T2A"
pd = db.Project.find_one({"project_code":"MFW001_0-010 Metro Paris-Ligne 15_T2A"})
p = Project(db, pd)
p.load()
found_domains = Domain.find(db, {"project_id": p._id})
for dom in found_domains:
    d = Domain(db,dom)
    d.load()
 
    # Example of aggregation
    aggrList = Alignment.aggregate_by_strata(db, d._id)
    for ii in aggrList:
        print ii
Example #43
0
def pre_process(optmap_i, optmap_file, myfile, myfile2, output_dir,
                min_confidence):
    header_lines = 10
    header = []
    minrefoverhang = 50000
    minqryoverhang = 50000

    all_alms = {}      # stores all the Alignments for all groups; all_alms[ref] holds the alignments for molecule ref
    qualify_alms = {}  # keep only one alignment (the one with highest confidence) per contig in a molecule
    removed = {}       # removed[ref, qry] == True means the alignment for (ref, qry) has been removed

    # collecting alignments and store in all_groups
    print '---------------read .xmap file-------------------'
    with open(myfile + '_flip.xmap', 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='\t')
        for i in range(header_lines):  # 10 lines of header
            header.append(csvreader.next())  # save them
        # read the first non-header line
        while True:
            try:
                row = csvreader.next()
                x = Alignment(int(row[1]), int(row[2]), float(row[3]),
                              float(row[4]), float(row[5]), float(row[6]),
                              row[7], float(row[8]), row[9], float(row[10]),
                              float(row[11]), int(row[12]), row[13])
                if x.ref not in all_alms:
                    all_alms[x.ref] = [x]
                else:
                    all_alms[x.ref].append(x)
            except StopIteration:
                break
    num_all_alms = 0
    for ref in all_alms:
        num_all_alms += len(all_alms[ref])
    print "In total, the number of alignments collected is ", num_all_alms

    # only keep one alignment(the one with highest confidence) for each contig in one molecule
    for ref in all_alms:
        group = all_alms[ref]
        qry_bestx = {}
        for x in group:
            if x.qry not in qry_bestx:
                qry_bestx[x.qry] = x
            else:
                if x.confidence > qry_bestx[x.qry].confidence:
                    qry_bestx[x.qry] = x

        qualify_alms[ref] = {}
        for qry in qry_bestx:
            qualify_alms[ref][qry] = qry_bestx[qry]
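    # Keeping only the highest-confidence alignment per qry is a standard
    # reduce-by-key pattern; the inner update above is equivalent to:
    #     qry_bestx[x.qry] = max(qry_bestx.get(x.qry, x), x,
    #                            key=lambda a: a.confidence)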

    num_qualify_alms = 0
    for ref in qualify_alms:
        num_qualify_alms += len(qualify_alms[ref])
    # initialize removed array
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            removed[ref, qry] = False
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/opt_" + str(optmap_i) + "_alms_0_initial.log")
    print "In total, the number of alignments in qualify_alms is ", num_qualify_alms

    # remove low confidence alignments
    print '---------------Remove low quality alignments---------------'
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if x.confidence < min_confidence:
                removed[ref, qry] = True
                print 'alignment (', ref, ',', qry, ') is low quality and removed'
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms,
        output_dir + "/opt_" + str(optmap_i) + "_alms_1_removed_low_conf.log")
    print "After removing low confidence alignments, the number of alignments is ", num_alms
    print '---------------End---------------'

    # read optical map
    optmap = {}
    with open(optmap_file) as f_map:
        for line in f_map:
            line = line.strip()
            if line[0] == '#':
                continue
            cols = line.split('\t')
            CMapId = int(cols[0])
            LabelChannel = cols[4]
            Position = float(cols[5])

            if CMapId not in optmap:
                optmap[CMapId] = []
            if LabelChannel == "1":
                optmap[CMapId].append(Position)
    for CMapId in optmap:
        optmap[CMapId].sort()
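    # Note: the two-line membership test in the loop above can be collapsed
    # with dict.setdefault:
    #     optmap.setdefault(CMapId, [])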

    print '---------------scaling-------------------'
    # calculating scaling
    qry_len = {}
    with open(myfile2 + '_key.txt') as f_key:
        for i in range(0, 4):  # 4 header lines
            f_key.readline()
        for line in f_key:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            seq_len = int(cols[2])
            qry_len[qry_id] = seq_len
    scaling = 0
    num = 0
    with open(myfile + '_r.cmap') as f_q:
        for i in range(0, 11):  # 11 header lines
            f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            seq_len = qry_len[qry_id]
            scaling += appr_len / seq_len
            num += 1
    scaling /= num  # computed value, e.g. scaling = 1.02258059775
    scaling = 1.0  # NOTE: the computed scaling is immediately overridden, so scaling is effectively disabled
    # use scaling to adjust the coordinates of alignments
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            x.qrystartpos /= scaling
            x.qryendpos /= scaling
            x.qrylen /= scaling
            x.refstartpos /= scaling
            x.refendpos /= scaling
            x.reflen /= scaling

    # use scaling to adjust the coordinates of the optical map
    for ref in optmap:
        for i in range(0, len(optmap[ref])):
            optmap[ref][i] /= scaling

    print '---------------END-------------------'

    # find the reference-based coordinates for each contig
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if (x.orientation == '+'):
                x.qry_left_overlen = x.qrystartpos
                x.qry_right_overlen = x.qrylen - x.qryendpos
            else:
                x.qry_left_overlen = x.qrylen - x.qrystartpos
                x.qry_right_overlen = x.qryendpos
            x.start = x.refstartpos - x.qry_left_overlen
            x.end = x.refendpos + x.qry_right_overlen
            x.ref_left_overlen = x.refstartpos
            x.ref_right_overlen = x.reflen - x.refendpos
            if (x.orientation == '+'):
                x.refstart = x.qrystartpos - x.ref_left_overlen
                x.refend = x.qryendpos + x.ref_right_overlen
            else:
                x.refstart = x.qryendpos - x.ref_right_overlen
                x.refend = x.qrystartpos + x.ref_left_overlen

    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/opt_" + str(optmap_i) + "_alms_2_scaled.log")
    print "After scaling, the number of alignments is ", num_alms

    # read qry map
    qry_markers = {}
    with open(myfile + '_r.cmap') as f_q:
        for i in range(11):  # 11 lines of header
            header_line = f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            CMapId = int(cols[0])
            ContigLength = float(cols[1])
            NumSites = int(cols[2])
            SiteID = int(cols[3])
            LabelChannel = cols[4]
            Position = float(cols[5])
            if LabelChannel == "0":
                continue
            if CMapId not in qry_markers:
                qry_markers[CMapId] = []
            Position /= scaling
            qry_markers[CMapId].append(Position)
    for CMapId in qry_markers:
        qry_markers[CMapId].sort()

    print '---------------candidate cutting sites-------------------'
    fpair = open(output_dir + "/chimeric_pairs_" + str(optmap_i) + ".log", 'w')
    fpair.write("ref_id\tref_pos\tqry_id\tqry_pos\n")
    chimeric_pairs = []

    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == True:
                continue
            x = qualify_alms[ref][qry]

            if (x.confidence > min_confidence):
                ref_left_overlen = x.refstartpos
                ref_right_overlen = x.reflen - x.refendpos
                flag_left = False
                flag_right = False
                if (x.qry_left_overlen > minqryoverhang
                        and ref_left_overlen > minrefoverhang
                        and markers_in_qry_left_overhang(qry_markers, x) > 0):
                    flag_left = True
                    chimeric_pairs.append(
                        (x.ref, x.refstartpos, x.qry, x.qrystartpos))
                    print(
                        x.ref, x.refstartpos, x.qry,
                        x.qrystartpos), "is a pair of candidate cutting sites"
                    fpair.write(
                        str(x.ref) + "\t" + str(x.refstartpos) + "\t" +
                        str(x.qry) + "\t" + str(x.qrystartpos) + "\n")
                if (x.qry_right_overlen > minqryoverhang
                        and ref_right_overlen > minrefoverhang
                        and markers_in_qry_right_overhang(qry_markers, x) > 0):
                    flag_right = True
                    chimeric_pairs.append(
                        (x.ref, x.refendpos, x.qry, x.qryendpos))
                    print(x.ref, x.refendpos, x.qry,
                          x.qryendpos), "is a pair of candidate cutting sites"
                    fpair.write(
                        str(x.ref) + "\t" + str(x.refendpos) + "\t" +
                        str(x.qry) + "\t" + str(x.qryendpos) + "\n")
                if flag_left == True and flag_right == True:
                    removed[ref, qry] = True
    fpair.close()
    print '---------------END-------------------'
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms, output_dir + "/opt_" + str(optmap_i) +
        "_alms_3_removed_both_overhang.log")
    print "After removing alignments with both overhangs, the number of alignments is ", num_alms

    # check overlap between alignments
    for r in qualify_alms:
        for q1 in qualify_alms[r]:
            if removed[r, q1] == True:
                continue
            x = qualify_alms[r][q1]
            for q2 in qualify_alms[r]:
                if removed[r, q2] == True:
                    continue
                y = qualify_alms[r][q2]
                if q1 >= q2:
                    continue
                if x.refstartpos <= y.refstartpos and y.refstartpos <= x.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - y.refstartpos
                elif y.refstartpos <= x.refstartpos and x.refstartpos <= y.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - x.refstartpos
                else:
                    overlap = 0
                if overlap >= 20000:
                    if x.confidence < y.confidence:
                        removed[r, q1] = True
                    else:
                        removed[r, q2] = True
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms,
        output_dir + "/opt_" + str(optmap_i) + "_alms_4_solved_overlaps.log")
    print "After removing one of two overlap alignments, the number of alignments is ", num_alms

    return current_alms, optmap, chimeric_pairs
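# The pairwise check near the end of pre_process computes the interval
# overlap inline. The same rule as a tiny standalone helper (illustrative
# names, not from the source):
def interval_overlap(start_a, end_a, start_b, end_b):
    # length of the intersection of [start_a, end_a] and [start_b, end_b];
    # 0 when the intervals are disjoint
    return max(0, min(end_a, end_b) - max(start_a, start_b))

assert interval_overlap(0, 100, 50, 150) == 50
assert interval_overlap(0, 10, 20, 30) == 0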
Example #45
0
class Line:
    """
    Represents a line in the SAM file.
    """

    TYPE_HEADER = 0
    TYPE_ALIGNMENT = 1

    def __init__(self, line_string):
        self.type = self.TYPE_HEADER if line_string.startswith('@') \
                else self.TYPE_ALIGNMENT

        if self.type == self.TYPE_HEADER:
            self.fields = [line_string]
            return

        self.fields = line_string.split()
        pos, cigar = self.fields[3], self.fields[5]

        if cigar == '*':
            raise CigarUnavailableError

        md = next(filter(lambda field: field.startswith('MD:Z:'), self.fields))
        md = md.replace('MD:Z:', '')
        self.alignment = Alignment(pos, cigar, md)

    def soft_clip(self, start, stop):
        if self.type == self.TYPE_HEADER:
            return

        self.strip_paired_end_info()

        self.fields[2] = '{}:{}-{}'.format(self.fields[2], start, stop)
        self.alignment.soft_clip(start, stop)
        self.fields[3] = str(self.alignment.pos)
        self.fields[5] = self.alignment.cigar
        self.fields = list(map(
                lambda field: 'MD:Z:'+self.alignment.md if \
                        field.startswith('MD:Z:') else field,
                self.fields))

    def strip_paired_end_info(self):
        '''
        fields[1]: Bitwise flags according to the SAM specifications:
              1 -- template having multiple segments in sequencing
              2 -- each segment properly aligned according to the aligner
              4 -- segment unmapped
              8 -- next segment in template unmapped
             16 -- SEQ being reverse complemented
             32 -- SEQ of the next segment in the template being reverse
                   complemented
             64 -- the first segment in the template
            128 -- the last segment in the template
            ...
        fields[6]: reference sequence name of the primary alignment of the next
                   read in the template; '*' when information is unavailable.
        fields[7]: 1-based position of the primary alignment of the next read in
                   the template; '0' when information is unavailable.
        fields[8]: signed observed template length; '0' for single-segment
                   template, or when information is unavailable.
        '''
        flags = int(self.fields[1])
        flags &= 0b00111100
        self.fields[1] = str(flags)

        self.fields[6:9] = ['*', '0', '0']

    def __repr__(self):
        return '\t'.join(self.fields)
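# A quick check of the mask used in strip_paired_end_info: 0b00111100 keeps
# the mapping/orientation bits (4, 8, 16, 32) and clears the template and
# pairing bits (1, 2, 64, 128). Illustrative only:
assert 99 & 0b00111100 == 32  # 99 = 1 + 2 + 32 + 64; only bit 32 survives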
class Person(object):

    def __init__(self, age=None, gender=None, genus='human'):
        self.player_controlled = False
        self._event_type = 'person'
        self.firstname = u"Антон"
        self.surname = u"Сычов"
        self.nickname = u"Сычуля"
        self.alignment = Alignment()
        self.features = []          # holds Feature() objects and their subclasses; add new Features only via self.add_feature()
        self.tokens = []             # Special resources to activate various events
        self.relations_tendency = {'convention': 0, 'conquest': 0, 'contribution': 0}
        # obedience, dependency and respect stats
        self._stance = []
        self.avatar_path = ''  

        self.master = None          # If this person is a slave, the master will be set
        self.supervisor = None
        self.slaves = []
        self.subordinates = []
        self.ap = 1
        self.schedule = Schedule(self)
        self.modifiers = Modifiers()
        # init starting features
        
        self.availabe_actions = [] # used when we are playing the slave part


        self.allowance = 0         # Sparks spend each turn on a lifestyle
        self.ration = {
            "amount": 'unlimited',   # 'unlimited', 'limited' by price, 'regime' for figure, 'starvation' no food
            "food_type": "cousine",   # 'forage', 'sperm', 'dry', 'canned', 'cousine'
            "target": 0,           # figures range -2:2
            "limit": 0,             # maximum resources spend to feed character each turn
            "overfeed": 0,
        }
        self.accommodation = 'makeshift'
        self.skills = []
        self.specialized_skill = None
        self.focused_skill = None
        self.skills_used = []
        self.factors = []
        self.restrictions = []
        self._needs = init_needs(self)


        self.attributes = {
            'physique': 3,
            'mind': 3,
            'spirit': 3,
            'agility': 3,
            'sensitivity':3
        }
        self.university = {'name': 'study', 'effort': 'bad', 'auto': False}
        self.mood = 0
        self.fatigue = 0
        self._vitality = 0
        self.appetite = 0
        self.calorie_storage = 0
        self.money = 0
        self._determination = 0
        self._anxiety = 0
        self.rewards = []
        self.used_rewards = []
        self.merit = 0 # player only var for storing work result

        # Other persons known and relations with them, value[1] = [needed points, current points]
        self._relations = []
        self.selfesteem = 0
        self.conditions = []
        self.genus = init_genus(self, genus)
        self.add_feature(age)
        self.add_feature(gender)
        self.set_avatar()
        persons_list.append(self)
    

    def set_avatar(self):
        path = 'images/avatar/'
        path += self.genus.head_type + '/'
        if self.gender != None:
            if self.gender == 'sexless':
                gender = 'male'
            elif self.gender == 'shemale':
                gender = 'female'
            else:
                gender = self.gender
            path += gender + '/'
        if self.age != None:
            path += self.age + '/'
        this_avas = [ava for ava in get_avatars() if ava.startswith(path)]
        try:
            avatar = choice(this_avas)
            avatar_split = avatar.split('/')
            for str_ in avatar_split:
                if 'skin' in str_:
                    skin_color = str_.split('_')[0]
                    self.add_feature(skin_color)
                if 'hair' in str_:
                    hair_color = str_.split('_')[0]
                    self.hair_color = hair_color
            self.avatar_path = avatar
        except IndexError:
            self.avatar_path = 'images/avatar/none.jpg'

    def randomise(self, gender='female', age='adolescent'):
        self.add_feature(gender)
        self.add_feature(age)
        self.random_alignment()
        self.random_skills()
        self.random_features()
        return

    def random_alignment(self):
        # roll activity
        roll = randint(1, 100)
        if roll <= 20:
            self.alignment.activity = "timid"
        elif roll > 80:
            self.alignment.activity = "ardent"
        else:
            self.alignment.activity = "reasonable"

        # roll orderliness
        roll = randint(1, 100)
        if roll <= 20:
            self.alignment.orderliness = "chaotic"
        elif roll > 80:
            self.alignment.orderliness = "lawful"
        else:
            self.alignment.orderliness = "conformal"

        # roll morality
        roll = randint(1, 100)
        if roll <= 20:
            self.alignment.morality = "evil"
        elif roll > 80:
            self.alignment.morality = "good"
        else:
            self.alignment.morality = "selfish"

        return
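    # Each roll above is the same 20/60/20 three-way split; as a reusable
    # sketch (hypothetical helper, not in the source):
    #
    #     def roll_three_way(low, mid, high):
    #         roll = randint(1, 100)
    #         if roll <= 20:
    #             return low
    #         if roll > 80:
    #             return high
    #         return mid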

    def random_skills(self, pro_skill=None, talent_skill=None):
        skilltree = ('coding', 'sport', 'conversation', 'sex', None)
        if talent_skill:
            self.skill(talent_skill).talent = True
        else:
            roll = choice(skilltree)
            if roll:
                self.skill(roll).talent = True

        if pro_skill:
            self.skill(pro_skill).profession()
        else:
            roll = choice(skilltree)
            if roll:
                self.skill(roll).profession()
        return

    def random_features(self):
        # constitution
        const = choice(('athletic', 'brawny',  'large', 'small', 'lean', 'crooked', 'clumsy'))
        roll = randint(1, 100)
        if roll > 40:
            self.add_feature(const)

        # soul
        soul = choice(('brave', 'shy', 'smart', 'dumb', 'sensitive', 'cool', None))
        if soul:
            self.add_feature(soul)

        # needs
        needstree = {'prosperity_feat': ('greedy', 'generous'),
                     'nutrition_feat': ('gourmet', 'moderate_eater'),
                     'wellness_feat': ('low_pain_threshold', 'high_pain_threshold'),
                     'comfort_feat': ('sybarite', 'ascetic'),
                     'activity_feat': ('energetic', 'lazy'),
                     'communication_feat': ('extrovert', 'introvert'),
                     'amusement_feat': ('curious', 'dull'),
                     'authority_feat': ('dominant', 'submissive'),
                     'ambition_feat': ('ambitious', 'modest'),
                     'eros_feat': ('lewd', 'frigid'), }
        for need in needstree:
            roll = randint(1, 100)
            if roll <= 20:
                self.add_feature(needstree[need][0])
            elif roll > 80:
                self.add_feature(needstree[need][1])

        return
    def change_genus(self, genus):
        self.genus = init_genus(self, genus)
    @property
    def known_characters(self):
        l = []
        for r in self._relations:
            persons = [p for p in r.persons if p != self]
            l += persons
        return l
    def add_modifier(self, name, attributes, time=None):
        self.modifiers.add_item(name, attributes, time)
    

    def count_modifiers(self, key):
        val = self.__dict__['modifiers'].get_modified_attribute(key)
        return val
    
    @property
    def focus(self):
        try:
            return self.focused_skill.focus
        except AttributeError:
            return 0
    
    @property
    def job(self):
        job = self.schedule.find_by_slot('job')
        if job == None:
            return 'idle'
        else:
            return job.name
    @property
    def minor(self):
        minor = self.schedule.find_by_slot('minor')
        if minor == None:
            return 'idle'
        else:
            return minor.name


    def show_job(self):
        job = self.schedule.find_by_slot('job')
        if not job:
            return 'idle'
        else:
            values = []
            s = ''
            for k, v in job.special_values.items():
                s += '%s: '%(k)
                try:
                    l = [i for i in v]
                    try:
                        for i in l:
                            s += '%s, '%(i.name())
                    except AttributeError:
                        for i in l:
                            s += '%s, '%(i)
                except TypeError:
                    try:
                        s += '%s, '%(v.name())
                    except AttributeError:
                        s += '%s, '%(v)
                if k not in job.special_values.items()[-1]:
                    s += '\n'
            return '%s, %s'%(job.name, s)


    def job_object(self):
        job = self.schedule.find_by_slot('job')
        if not job:
            return None
        else:
            return job
    def __getattribute__(self, key):
        if not key.startswith('__') and not key.endswith('__'):
            try:
                genus = super(Person, self).__getattribute__('genus')
                value = getattr(genus, key)
                genus.last_caller = self
                return value
            except AttributeError:
                pass
        return super(Person, self).__getattribute__(key)

    def __getattr__(self, key):
        if key in self.attributes:
            value = self.attributes[key]
            value += self.count_modifiers(key)
            if value < 1:
                value = 1
            if value > 5:
                value = 5
            return value
        n = self.get_all_needs()
        if key in n.keys():
            return n[key]
        else:
            raise AttributeError(key)


    def __setattr__(self, key, value):
        if 'attributes' in self.__dict__:
            if key in self.attributes:
                value -= self.count_modifiers(key)
                self.attributes[key] = value
                if self.attributes[key] < 0:
                    self.attributes[key] = 0
        super(Person, self).__setattr__(key, value)

    @property
    def determination(self):
        return self._determination
    @determination.setter
    def determination(self, value):
        self._determination = value
        if self._determination < 0:
            self._determination = 0
    @property
    def anxiety(self):
        return self._anxiety
    @anxiety.setter
    def anxiety(self, value):
        self._anxiety = value
        if self._anxiety < 0:
            self._anxiety = 0


    def modifiers_separate(self, modifier, names=False):
        return self.modifiers.get_modifier_separate(modifier, names)
    def vitality_info(self):
        d = {'physique': self.physique, 'shape': self.count_modifiers('shape'), 'fitness':self.count_modifiers('fitness'),
            'mood': self.mood, 'therapy': self.count_modifiers('therapy')}
        l = self.modifiers_separate('vitality', True)
        return d, l
    @property
    def vitality(self):
        l = [self.physique, self.count_modifiers('shape'), self.count_modifiers('fitness'), self.mood,
            self.count_modifiers('therapy')]
        l += self.modifiers_separate('vitality')
        l = [i for i in l if i != 0]
        lgood = []
        lbad = []
        for i in l:
            if i > 0:
                lgood.append(i)
            elif i < 0:
                lbad.append(i)
        val = 0
        bad = len(lbad)
        lgood.sort()
        for i in range(bad):
            try:
                lgood.pop(0)
            except IndexError:
                return 0
        while len(lgood) > 0:
            num = min(lgood)
            if num > val:
                val += 1
            lgood.remove(num)
        val += self._vitality
        if val > 5:
            val = 5
        return val


    @property
    def gender(self):
        try:
            gender = self.feature_by_slot('gender').name
            return gender
        except AttributeError:
            return None
    @property
    def age(self):
        try:
            gender = self.feature_by_slot('age').name
            return gender
        except AttributeError:
            return None
    def phobias(self):
        l = []
        for feature in self.features:
            if isinstance(feature, Phobia):
                l.append(feature.object_of_fear)
        return l
    def get_needs(self):
        d = {}
        for need in self._needs:
            if need.level > 0:
                d[need.name] = need
        return d

    def get_all_needs(self):
        d = {}
        for need in self._needs:
            d[need.name] = need
        return d
    def show_taboos(self):
        s = ""
        for taboo in self.taboos:
            if taboo.value != 0:
                s += "{taboo.name}({taboo.value}), ".format(taboo=taboo)
        return s


    def show_needs(self):
        s = ""
        for need in self.get_needs().values():
            s += "{need.name}({need.level}), ".format(need=need)
        return s

    def show_features(self):
        s = ""
        for feature in self.features:
            if feature.visible:
                s += "{feature.name}, ".format(feature=feature)
        return s

    def show_focus(self):
        if isinstance(self.focused_skill, Skill):
            return self.focused_skill.name
        else:
            return "No focused skill"

    def show_skills(self):
        s = ""
        for skill in self.skills:
            s += "{name}({skill.level}, {skill.attribute}({value}))".format(name=skill.name, skill=skill, value=skill.attribute_value())
            if skill != self.skills[len(self.skills)-1]:
                s += ', '
        return s

    def show_mood(self):
        m = {-1: '!!!CRUSHED!!!', 0: 'Gloomy', 1: 'Tense', 2: 'Content', 3: 'Serene', 4: 'Joyful', 5: 'Enthusiastic'}
        mood = self.mood
        return "{mood}({val})".format(mood=m[mood], val=mood)


    def show_attributes(self):
        s = ""
        for key in self.attributes.keys():
            s += "{0}({1})".format(key, getattr(self, key))
        return s


    def show_tokens_difficulty(self):
        s = ""
        for key, value in self.tokens_difficulty.items():
            s += "{0}({1}), ".format(key, value)
        return s

    def name(self):
        s = self.firstname + " " + self.surname
        return s


    def taboo(self, name):
        for t in self.taboos:
            if t.name == name:
                return t
        return "No taboo named %s"%(name)
  

    def skill(self, skillname):
        skill = None
        for i in self.skills:
            if i.name == skillname:
                skill = i
                return skill
            
        if skillname in skills_data:
            skill = Skill(self, skillname, skills_data[skillname])
            self.skills.append(skill)
            return skill
        else:
            raise Exception("No skill named %s in skills_data"%(skillname))
        


    def tick_features(self):
        for feature in self.features:
            feature.tick_time()
    
    def use_skill(self, name):
        if isinstance(name, Skill):
            self.skills_used.append(name)
        else:
            self.skills_used.append(self.skill(name))
    def get_used_skills(self):
        l = []
        for skill in self.skills_used:
            if isinstance(skill, Skill):
                l.append(skill)
            else:
                l.append(self.skill(skill))
        return l
    def calc_focus(self):
        if self.focused_skill:
            if self.focused_skill in self.get_used_skills():
                self.focused_skill.focus += 1
                self.skills_used = []
                return
        try:
            self.focused_skill.focus = 0
        except AttributeError:
            pass

        if len(self.skills_used) > 0:
            from collections import Counter
            counted = Counter()
            for skill in self.get_used_skills():
                counted[skill.name]+=1
            maximum = max(counted.values())
            result = []
            for skill in counted:
                if counted[skill] == maximum:
                    result.append(skill)
            self.skill(choice(result)).set_focus()
        else:
            self.focused_skill = None
        
        self.skills_used = []
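    # The Counter block above collects every skill tied for the maximum count
    # so one can be focused at random; for just the single top entry,
    # most_common gives it directly, e.g.:
    #     Counter(['sport', 'coding', 'sport']).most_common(1)  # [('sport', 2)]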

    def recalculate_mood(self):
        mood = 0
        happines = []
        dissapointment = []
        dissapointments_inf = []
        satisfactions_inf = collections.defaultdict(list)
        determination = []
        anxiety = []
        for need in self.get_needs().values():
            if need.tension and need.level > 0:
                dissapointment.append(need.level)
                dissapointments_inf.append(need)
            if need.satisfaction > 0:
                happines.append(need.satisfaction)
                satisfactions_inf[need.satisfaction].append(need)
                if need.level == 3:
                    happines.append(need.satisfaction)
                    satisfactions_inf[need.satisfaction].append(need)
        for i in range(self.determination):
            happines.append(1)
            determination.append('determination')
        for i in range(self.anxiety):
            dissapointment.append(1)
            anxiety.append('anxiety')
        hlen = len(happines)
        dlen = len(dissapointment)
        happines.sort()
        dissapointment.sort()
        renpy.call_in_new_context('mood_recalc_result', dissapointments_inf, satisfactions_inf, determination, anxiety, True, self)
        if hlen > dlen:
            dissapointment = []
            for i in range(dlen):
                happines.pop(0)
            threshold = happines.count(5)
            sens = 5-self.sensitivity
            if threshold > sens:
                mood = 5
            elif threshold+happines.count(4) > sens:
                mood = 4
            elif threshold+happines.count(4)+happines.count(3) > sens:
                mood = 3
            elif threshold+happines.count(4)+happines.count(3)+happines.count(2) > sens:
                mood = 2
            elif threshold+happines.count(4)+happines.count(3)+happines.count(2)+happines.count(1) > sens:
                mood = 1

        elif hlen < dlen:
            axniety_holder = self.anxiety
            happines = []
            for i in range(hlen):
                dissapointment.pop(0)
            dissapointment = [i for i in dissapointment if i > 1]
            despair = 6-self.sensitivity-dissapointment.count(2)
            despair2 = dissapointment.count(3)
            if despair < 0:
                if abs(despair) > self.anxiety:
                    self.anxiety += 1
                    mood = -1
            else:
                despair2 -= despair
            if despair2 > 0:
                self.anxiety += despair2
                mood = -1

        
        else:
            mood = 0
        for key in satisfactions_inf:
            for need in satisfactions_inf[key]:
                need.satisfaction = 0
                need.tension = False
        for need in dissapointments_inf:
            need.satisfaction = 0
            need.tension = False
        self.mood = mood



    def motivation(self, skill=None, tense_needs=[], satisfy_needs=[], beneficiar=None, morality=0, special=[]):  # needs should be a list of tuples [(need, shift)]
        motiv = 0
        motiv += morality
        for i in special:
            motiv += i
        if skill:
            if self.skill(skill).talent:
                motiv += 1
            elif self.skill(skill).inability:
                motiv -= 1

        intense = []
        self_needs = self.get_needs()
        for need in tense_needs:
            if need in self_needs.keys():
                motiv -= 1
        for need in satisfy_needs:
            if need in self_needs.keys():
                intense.append(self_needs[need].level)
        try:
            maximum = max(intense)
        except ValueError:
            maximum = 0
        motiv += maximum

        if beneficiar:
            if beneficiar == self:
                motiv += 2
            else:
                motiv += self.stance(beneficiar).value
                if self.stance(beneficiar).value < 0:
                    motiv = 0
                if beneficiar == self.master or beneficiar == self.supervisor:
                    if self.stance(beneficiar).value == 0:
                        motiv = min(beneficiar.mind, beneficiar.spirit)
                    elif self.stance(beneficiar).value == 2:
                        motiv = 5
        if motiv < 0:
            motiv = 0
        if motiv > 5:
            motiv = 5

        return motiv
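    # The two checks above clamp motiv into the 0..5 range; equivalently:
    #     motiv = max(0, min(5, motiv))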

    

    def add_feature(self, name):    # adds a feature to the person; if mutually exclusive, removes the old feature
        Feature(self, name)
    def add_phobia(self, name):
        Phobia(self, name)
    def feature_by_slot(self, slot):        # finds the feature which holds the needed slot
        for f in self.features:
            if f.slot == slot:
                return f
        return None

    def feature(self, name):                # finds the feature with the needed name, if it exists
        for f in self.features:
            if f.name == name:
                return f
        return None

    def remove_feature(self, feature):       # feature is a str or a Feature()
        if isinstance(feature, str):
            for f in self.features:
                if f.name == feature:
                    f.remove()
        else:
            i = self.features.index(feature)
            self.features[i].remove()
            return


    def remove_feature_by_slot(self, slot):
        for f in self.features:
            if f.slot == slot:
                f.remove()
        

    def description(self):
        txt = self.firstname + ' "' + self.nickname + '" ' + self.surname
        txt += '\n'
        for feature in self.features:
            txt += feature.name
            txt += ','

        return txt
    def reset_needs(self):
        for need in self.get_all_needs().values():
            need.reset()
    def rest(self):
        self.conditions = []
        self.modifiers.tick_time()
        self.tick_features()
        self.schedule.use_actions()
        self.fatness_change()
        self.recalculate_mood()
        self.reset_needs()
        self.calc_focus()
        self.reduce_esteem()




    def food_demand(self):
        """
        Evaluate optimal food consumption to maintain current weight.
        :return:
        """
        demand = self.physique
        demand += self.appetite
        demand += self.count_modifiers('food_demand')

        if demand < 1:
            demand = 1

        return demand

    def food_desire(self):
        """
        Evaluate the amount of food the character would like to consume.
        :return:
        """
        desire = self.food_demand()
        if self.nutrition.level == 0:
            desire -= 1
        elif self.nutrition.level == 3:
            desire += 1
        if self.feature('obese'):
            desire -= 1
        elif self.feature('emaciated'):
            desire += 2
        elif self.feature('slim'):
            desire += 1
        desire += self.count_modifiers("food_desire")

        if desire < 1:
            desire = 1

        return desire

    def get_food_consumption(self, show_multi=False):
        types = {'sperm': 0, 'forage': 0, 'dry': 1, 'canned': 2, 'cousine': 2}
        value = self.consume_food()
        multiplier = types[self.ration['food_type']]
        if show_multi:
            return value*multiplier, self.ration['food_type']
        return value * multiplier
    def consume_food(self):
        food_consumed = self.food_desire()
        fatness = self.feature_by_slot('shape')
        if fatness:
            fatness = fatness.name
        flist = ['emaciated', 'slim', None, 'chubby', 'obese']
        val = flist.index(fatness)
        if self.ration['amount'] == 'starvation':
            food_consumed = 0

        if self.ration['amount'] == 'limited':
            if food_consumed > self.ration["limit"]:
                food_consumed = self.ration["limit"]

        if self.ration['amount'] == 'regime':
            food_consumed = self.food_demand()
            if self.ration['target'] > val:
                food_consumed += 1+self.appetite
            if self.ration['target'] < val:
                food_consumed = self.food_demand() - 1
            if self.ration['target'] == val:
                food_consumed = self.food_demand()
        return food_consumed

    def fatness_change(self):
        consumed = self.consume_food()
        demand = self.food_demand()
        desire = self.food_desire()
        calorie_difference = consumed-demand
        if consumed < desire:
            self.nutrition.set_tension()
        if self.ration['amount'] != 'starvation':
            d = {'sperm': -4, 'forage': -1, 'dry': -2, 'canned': 0, 'cousine': 3}
            if d[self.ration['food_type']] < 0:
                self.nutrition.set_tension()
            else:
                self.nutrition.satisfaction = d[self.ration['food_type']]
        self.calorie_storage += calorie_difference
        fatness = self.feature_by_slot('shape')
        if fatness != None:
            fatness = fatness.name
        flist = ['emaciated', 'slim', None, 'chubby', 'obese']
        ind = flist.index(fatness)
        if self.calorie_storage <= 0:
            self.remove_feature('dyspnoea')
        if self.calorie_storage >= 0:
            self.remove_feature('starving')
        if self.calorie_storage < 0:
            chance = randint(-10, -1)
            if self.calorie_storage <= chance:
                ind -= 1
                if self.feature('dyspnoea'):
                    self.remove_feature('dyspnoea')
                if ind < 0:
                    ind = 0
                    if self.feature('starving'):
                        self.add_feature('dead')
                    else:
                        self.add_feature('starving')
                f = flist[ind]
                if f:
                    self.add_feature(f)
                else:
                    self.feature_by_slot('shape').remove()
                if not self.feature('starving'):
                    self.calorie_storage = 0
                return 'fatness -'
        if self.calorie_storage > 0:
            chance = randint(1, 10)
            if self.calorie_storage >= chance:
                ind += 1
                if ind > 4:
                    ind = 4
                    if self.feature('dyspnoea'):
                        self.add_feature('diabetes')
                    else:
                        self.add_feature('dyspnoea')
                f = flist[ind]
                if f:
                    self.add_feature(f)
                else:
                    self.feature_by_slot('shape').remove()
                if not self.feature('dyspnoea'):
                    self.calorie_storage = 0
                return 'fatness +'
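    # Worked example of the drift rule above (illustrative numbers): with
    # calorie_storage == +4, randint(1, 10) lands at or below 4 with
    # probability 0.4, so each turn there is roughly a 40% chance the shape
    # moves one step up flist and the surplus resets to zero; deficits
    # mirror this through randint(-10, -1).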

    def nutrition_change(self, food_consumed):
        if food_consumed < self.food_demand():
            self.ration["overfeed"] -= 1
            chance = randint(-10, -1)
            if self.ration["overfeed"] <= chance:
                self.ration["overfeed"] = 0

    def know_person(self, person):
        return person in self.known_characters

    def _set_relations(self, person):
        relations = Relations(self, person)
        person._relations.append(relations)
        self._relations.append(relations)
        return relations


    def relations(self, person):
        if person == self:
            raise Exception("relations: target and caller are the same person")
        if isinstance(person, Fraction):
            return self.relations(person.owner)
        if not self.know_person(person):
            relations = self._set_relations(person)
            self._set_stance(person)
            return relations
        for rel in self._relations:
            if self in rel.persons and person in rel.persons:
                return rel

    def _set_stance(self, person):
        stance = Stance(self, person)
        self._stance.append(stance)
        person._stance.append(stance)
        return stance

    
    def stance(self, person):
        if person == self:
            raise Exception("stance: target and caller are the same person")
        if isinstance(person, Fraction):
            return self.stance(person.owner)
        elif not self.know_person(person):
            self._set_relations(person)
            stance = self._set_stance(person)

        else:
            for s in self._stance:
                if self in s.persons and person in s.persons:
                    stance = s
        if person in self.slaves:
            stance._type = 'master'
        elif person == self.master:
            stance._type = 'slave'
        else:
            stance._type = 'neutral'
        return stance


    def use_token(self, token):
        if self.has_token(token):
            self.tokens.remove(token)
        else:
            return "%s has no token named %s"%(self.name(), token)


    def has_token(self, token):
        return token in self.tokens


    def has_any_token(self):
        return len(self.tokens) > 0

    
    def add_token(self, token, free=False):
        if not self.has_token(token):
            self.tokens.append(token)
            if token not in ('accordance', 'antagonism'):
                if not free:
                    self.player_relations().stability += 1
                self.relations_tendency[token] += 1
            renpy.call_in_new_context('lbl_notify', self, token)


    def player_relations(self):
        for rel in self._relations:
            if rel.is_player_relations():
                return rel
        return None

    
    def moral_action(self, *args, **kwargs):
        for arg in args:
            if isinstance(arg, int):
                self.selfesteem += arg
                return 
        result = self.check_moral(*args, **kwargs)
        self.selfesteem += result
        return result
        

    def check_moral(self, *args, **kwargs):
        result = 0
        act = {'ardent': 1, 'reasonable': 0, 'timid': -1}
        moral = {'good': 1, 'selfish': 0, 'evil': -1}
        order = {'lawful': 1, 'conformal': 0, 'chaotic': -1}
        action_tones = {'activity': None, 'morality': None, 'orderliness': None}
        activity = None
        morality = None
        orderliness = None
        target = None
        
        if 'target' in kwargs:
            if isinstance(kwargs['target'], Person):
                target = kwargs['target']
        
        else:
            for arg in args:
                if isinstance(arg, Person):
                    target = arg
        
        for arg in args:
            if arg in act:
                activity = arg
            if arg in moral:
                morality = arg
            if arg in order:
                orderliness = arg

        # map the parsed keywords onto numeric tone values so the scoring
        # loop below can see them; unrecognised tones stay None
        action_tones['activity'] = act.get(activity)
        action_tones['morality'] = moral.get(morality)
        action_tones['orderliness'] = order.get(orderliness)

        for k, v in action_tones.items():
            if v:
                valself = getattr(self.alignment, k)
                valact = v
                if valself != 0:
                    if valself + valact == 0:
                        result -= 1
                    elif abs(valself + valact) == 2:
                        result += 1
                elif target:
                    if valact != 0:
                        if getattr(self.relations(target), Alignment.relation_binding[k]) != valact:
                            result -= 1
                        else:
                            result += 1
        return result
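    # Worked example: an 'ardent' 'good' action judged by a character whose
    # alignment has activity == 1 and morality == -1 scores +1 for activity
    # (abs(1 + 1) == 2) and -1 for morality ((-1) + 1 == 0), so check_moral
    # returns 0 and moral_action leaves selfesteem unchanged.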

    def reduce_esteem(self):
        if self.selfesteem == 0:
            return
        # decay towards zero; clamp so high sensitivity cannot invert the decay
        val = 5 - self.sensitivity
        if val < 0:
            val = 0
        if self.selfesteem > 0:
            self.selfesteem -= val
        elif self.selfesteem < 0:
            self.selfesteem += val


    def enslave(self, target):
        target.master = self
        target.supervisor = self
        self.slaves.append(target)
        self.relations(target)

    def set_supervisor(self, supervisor):
        self.supervisor = supervisor

    def master_stance(self, target):
        if self.player_controlled:
            raise Exception('master_stance is only for npc')
        stance = self.stance(target).level
        l = ['cruel', 'opressive', 'rightful', 'benevolent']
        ind = l.index(stance)
        return ind

    def desirable_relations(self):
        d = {'lawful': ('formal', 'loyality'), 'chaotic': ('intimate', 'scum-slave'),
            'timid': ('delicate', 'worship'), 'ardent': ('intense', 'disciple'),
            'good': ('supporter', 'dedication'), 'evil': ('contradictor', 'henchman')}

        return [d.get(x) for x in self.alignment.description()]

    def willing_available(self):
        if not self.master:
            return []
        rel_check = False
        desired = self.desirable_relations()
        types = [x[1] for x in desired if isinstance(x, tuple)]
        check = [x[0] for x in desired if isinstance(x, tuple)]
        for rel in self.relations(self.master).description():
            if rel in check:
                rel_check = True
                break
        if self.stance(self.master).respect() < self.spirit:
            rel_check = False
        if not self.has_token('accordance'):
            rel_check = False
        if rel_check:
            return types
        else:
            return []


    def attitude_tendency(self):
        n = 0
        token = None
        for k, v in self.relations_tendency.items():
            if v > n:
                n = v
                token = k
        if self.relations_tendency.values().count(n) > 1:
            return None
        return token


    def add_condition(self, condition):
        if not self.has_condition(condition):
            self.conditions.append(condition)


    def has_condition(self, condition):
        return condition in self.conditions


    def remove_condition(self, condition):
        try:
            self.conditions.remove(condition)
        except ValueError:
            pass
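The feeding pipeline above (food_demand -> food_desire -> consume_food -> fatness_change) reduces to a random walk over the shape index, where the stored surplus or deficit raises the chance of a step each turn. A minimal, self-contained sketch of just that walk; SHAPES and drift are hypothetical stand-ins for the game objects, and the starving/dyspnoea edge cases are omitted:

from random import randint

SHAPES = ['emaciated', 'slim', None, 'chubby', 'obese']  # None == average build

def drift(index, storage):
    # One turn of the calorie-storage rule from fatness_change: the further
    # storage drifts from zero, the likelier a one-step shape change that
    # resets it.
    if storage < 0 and storage <= randint(-10, -1):
        return max(index - 1, 0), 0
    if storage > 0 and storage >= randint(1, 10):
        return min(index + 1, len(SHAPES) - 1), 0
    return index, storage

index, storage = 2, 0              # average shape, no stored calories
for _ in range(20):
    storage += 1                   # overeating by one unit every turn
    index, storage = drift(index, storage)
print(SHAPES[index])               # very likely 'obese' after 20 turns of surplus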
Example #47
0
class Analysis(object):
    """Performs the analysis and collects the results"""
    def __init__(self, cfg, force_restart, threads):
        the_config.validate()

        # TODO: Remove -- put this all into "options"
        if threads == -1:
            threads = threadpool.get_cpu_count()

        self.threads = threads

        # TODO: Move these to the config validate and prepare
        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Make some folders for the analysis
        the_config.make_output_folders()
        the_config.database = Database(the_config)

        # Check for old analyses to see if we can use the old data
        the_config.check_for_old_config()

        # TODO: This is going to be in "Prepare"
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

        # Store the result in here
        self.results = results.AnalysisResults(the_config.model_selection)

    def process_restart(self, force_restart):
        if force_restart:
            # Remove everything
            if os.path.exists(the_config.output_path):
                log.warning("Deleting all previous workings in '%s'" %
                            the_config.output_path)
                shutil.rmtree(the_config.output_path)
        else:
            # Remove the schemes folder, and clean out the phylofiles folder
            if os.path.exists(the_config.schemes_path):
                log.debug("Removing files in '%s'" % the_config.schemes_path)
                shutil.rmtree(the_config.schemes_path)
            if os.path.exists(the_config.phylofiles_path):
                log.debug("Removing files in '%s'" %
                          the_config.phylofiles_path)
                shutil.rmtree(the_config.phylofiles_path)

    def analyse(self):
        try:
            self.do_analysis()
        finally:
            # TODO: Not really the right place for it?
            the_config.database.close()
        return self.results

    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError

            compare = lambda x, y: collections.Counter(
                x) == collections.Counter(y)

            if not compare(old_align.species, self.alignment.species):
                log.error(
                    """Species names in alignment have changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError

        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        if os.path.exists(tree_path):
            if ';' in open(tree_path).read():
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info("""Starting tree file found but it is incomplete.
                             Re-estimating""")
                redo_tree = True
        else:
            log.info("Starting tree will be estimated from the data.")
            redo_tree = True

        return redo_tree

    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(
            the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(self.alignment,
                                                  subset_with_everything)
        self.filtered_alignment_path = os.path.join(the_config.start_tree_path,
                                                    'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(subset_with_everything,
                                           self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path,
                                             'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree == True:
                log.debug("didn't find tree at %s, making a new one" %
                          tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras)
                need_bl = True
            elif the_config.no_ml_tree == False:
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML" %
                    tree_path)

                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme",
                    range(len(the_config.user_subsets)))

                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras, tree_scheme, self.threads)

                # here we copy the ML tree topology so it can be used with PhyML too
                # TODO: this is a hack, and it would be better to decide on a universal
                # name for the different types of tree we might have.
                phyml_tree = os.path.join(
                    os.path.dirname(topology_path),
                    "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)

                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path, topology_path,
                    the_config.datatype, the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                  self.tree_path)

    def run_task(self, model_name, sub):
        # This bit should run in parallel (forking the processor)
        try:
            the_config.processor.analyse(model_name, sub.alignment_path,
                                         self.tree_path,
                                         the_config.branchlengths,
                                         the_config.cmdline_extras)
            fabricate = False
        except ExternalProgramError:
            if not the_config.suppress_errors:
                # In the Kmeans algorithm we suppress errors and "fabricate"
                # subsets (we assume the error is because the subset is too
                # small for analysis)
                raise

            # If it is kmeans we assume that the error is because the subset
            # is too small or unanalysable, so we fabricate it
            log.debug("New subset could not be analysed. It will be merged "
                      "at the end of the analysis")
            fabricate = True

        # Not entirely sure that WE NEED to block here, but it is safer to do so.
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            if fabricate:
                sub.fabricate_model_result(the_config, model_name)
            else:
                sub.parse_model_result(the_config, model_name)

            # Try finalising, then the result will get written out earlier...
            sub.finalise(the_config)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        for func, args in tasks:
            log.debug("About to analyse subset %s", args[1].name)
            func(*args)

    def run_threaded(self, tasks):
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_list_of_subsets(self, all_subsets):
        # get a whole list of subsets analysed in parallel

        # analyse bigger subsets first, for efficiency
        all_subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

        # chunk the list into blocks of ~1000 tasks
        # in empirical testing, this speeds things up a lot,
        # though we are not entirely sure why...
        n = 1000
        n = int(n / len(the_config.models))
        if n < 1:
            n = 1  # seems unlikely...

        log.debug("chunk size (in number of subsets) = %d", n)

        subset_chunks = [
            all_subsets[i:i + n] for i in xrange(0, len(all_subsets), n)
        ]

        for subsets in subset_chunks:
            # prepare the list of tasks
            tasks = []
            for sub in subsets:
                if sub.is_done:
                    pass
                elif sub.is_prepared:
                    self.add_tasks_for_sub(tasks, sub)
                else:
                    sub.prepare(the_config, self.alignment)
                    self.add_tasks_for_sub(tasks, sub)
            if tasks:
                # Now do the analysis
                if self.threads == 1:
                    self.run_concurrent(tasks)
                else:
                    self.run_threaded(tasks)

        # Now see if we're done
        for sub in all_subsets:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(the_config):
                log.error("Failed to run models %s; not sure why" %
                          ", ".join(list(sub.models_not_done)))
                raise AnalysisError

    def analyse_scheme(self, sch):
        # Progress
        the_config.progress.next_scheme()

        # analyse the subsets in the scheme that aren't done
        # NB for most schemes we will have all subsets done, so this saves time
        not_done = []
        for sub in sch:
            if not sub.is_done:
                not_done.append(sub)
        if not_done:
            self.analyse_list_of_subsets(not_done)

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     the_config.branchlengths,
                                     the_config.model_selection)
        self.results.add_scheme_result(sch, result)

        return result
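The sort-then-chunk step in analyse_list_of_subsets above is easy to sanity-check in isolation. A minimal sketch with made-up column counts standing in for real Subset objects (all names here are illustrative, not part of the codebase):

subset_sizes = [120, 5, 900, 48, 300, 7]    # hypothetical subset column counts
ordered = sorted(subset_sizes, key=lambda c: 1.0 / float(c))   # biggest first
n = 2                                       # chunk size after the model-count division
chunks = [ordered[i:i + n] for i in range(0, len(ordered), n)]
print(chunks)   # [[900, 300], [120, 48], [7, 5]]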
Example #48
0
import sys
from alignment import Alignment

file = sys.argv[1]
#file = 'test_files/test.txt'
args = open(file).readlines()
flag = args[0].rstrip()
scores = args[1].split()
match = int(scores[0])
mismatch = int(scores[1])
indel = int(scores[2])
seq1 = args[2].rstrip()
seq2 = args[3].rstrip()

if flag == 'g':
    a = Alignment(match, mismatch, indel, seq1, seq2)
    a.single_global_single_align()
    a.report_optimal_score()
elif flag == 'l':
    a = Alignment(match, mismatch, indel, seq1, seq2)
    a.local_single_align()
    a.report_optimal_score()
else:
    print "Invalid alignment flag."
    sys.exit(1)

out = open('results.txt', "w")
out.write("Score:")
out.write(str(a.get_optimal_score()))
out.write("\n")
out.write("Number of Optimal Alignments:")
out.write(str(a.get_total_optimal_alignments()))
out.close()
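For reference, the parsing code above implies a four-line input file: the flag on line one, the three integer scores on line two, and the two sequences after that. A plausible test file, assuming match/mismatch/indel scoring (contents invented for illustration):

g
1 -1 -2
ACGTTA
ACTTA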
Example #49
0
class Analysis(object):
    """Performs the analysis and collects the results"""
    def __init__(self, cfg, force_restart=False, threads=-1):
        cfg.validate()
        self.cfg = cfg
        self.threads = threads

        self.results = results.AnalysisResults(self.cfg.model_selection)

        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Check for old analyses to see if we can use the old data
        self.cfg.check_for_old_config()

        # Make some folders for the analysis
        self.cfg.make_output_folders()
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

    def process_restart(self, force_restart):
        if force_restart:
            # Remove everything
            if os.path.exists(self.cfg.output_path):
                log.warning("Deleting all previous workings in '%s'", self.cfg.output_path)
                shutil.rmtree(self.cfg.output_path)
        else:
            # Just remove the schemes folder
            if os.path.exists(self.cfg.schemes_path):
                log.info("Removing Schemes in '%s' (they will be recalculated from existing subset data)", self.cfg.schemes_path)
                shutil.rmtree(self.cfg.schemes_path)

    def analyse(self):
        self.do_analysis()
        return self.results

    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("Alignment file has changed since previous run. You need to use the force-restart option.")
                raise AnalysisError

        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        if os.path.exists(tree_path):
            if ';' in open(tree_path).read():
                log.info("Starting tree file found.")
                redo_tree = False
            else: 
                log.info("Starting tree file found but incomplete. Re-estimating")
                redo_tree = True
        else:
            log.info("No starting tree file found.")
            redo_tree = True
        
        return redo_tree

    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(self.cfg.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = self.cfg.processor.make_tree_path(self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")
            
            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(self.cfg.start_tree_path, keep = ["filtered_source.phy", "source.phy"])
            
            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path, 'user_topology.phy')
                self.cfg.processor.dupfile(user_path, topology_path)
            else:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = self.cfg.processor.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype, self.cfg.cmdline_extras)

            # Now estimate branch lengths
            tree_path = self.cfg.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology_path,
                self.cfg.datatype,
                self.cfg.cmdline_extras)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s", self.tree_path)

    def run_task(self, m, sub):
        # This bit should run in parallel (forking the processor)
        self.cfg.processor.analyse(
            m,
            sub.alignment_path,
            self.tree_path,
            self.cfg.branchlengths,
            self.cfg.cmdline_extras
        )

        # Not entirely sure that WE NEED to block here, but it is safer to do so.
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            sub.parse_model_result(self.cfg, m)
            # Try finalising, then the result will get written out earlier...
            sub.finalise(self.cfg)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        for func, args in tasks:
            func(*args)

    def run_threaded(self, tasks):
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_scheme(self, sch):
        # Progress
        self.cfg.progress.next_scheme()

        # Prepare by reading everything in first
        tasks = []
        for sub in sch:
            sub.prepare(self.cfg, self.alignment)
            self.add_tasks_for_sub(tasks, sub)

        # Now do the analysis
        if self.threads == 1:
            self.run_concurrent(tasks)
        else:
            self.run_threaded(tasks)

        # Now see if we're done
        for sub in sch:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(self.cfg):
                log.error("Failed to run models %s; not sure why", ", ".join(list(sub.models_to_do)))
                raise AnalysisError

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq, self.cfg.branchlengths, self.cfg.model_selection)
        self.results.add_scheme_result(sch, result)

        return result
Example #50
0
    def __init__(self, age=None, gender=None, genus='human'):
        self.player_controlled = False
        self._event_type = 'person'
        self.firstname = u"Антон"
        self.surname = u"Сычов"
        self.nickname = u"Сычуля"
        self.alignment = Alignment()
        self.features = []          # gets Feature() objects and their children. Add new features only with self.add_feature()
        self.tokens = []             # Special resources to activate various events
        self.relations_tendency = {'convention': 0, 'conquest': 0, 'contribution': 0}
        # obedience, dependency and respect stats
        self._stance = []
        self.avatar_path = ''  

        self.master = None          # If this person is a slave, the master will be set
        self.supervisor = None
        self.slaves = []
        self.subordinates = []
        self.ap = 1
        self.schedule = Schedule(self)
        self.modifiers = Modifiers()
        # init starting features
        
        self.availabe_actions = []  # used when playing the slave part


        self.allowance = 0         # Sparks spend each turn on a lifestyle
        self.ration = {
            "amount": 'unlimited',   # 'unlimited', 'limited' by price, 'regime' for figure, 'starvation' no food
            "food_type": "cousine",   # 'forage', 'sperm', 'dry', 'canned', 'cousine'
            "target": 0,           # figures range -2:2
            "limit": 0,             # maximum resources spend to feed character each turn
            "overfeed": 0,
        }
        self.accommodation = 'makeshift'
        self.skills = []
        self.specialized_skill = None
        self.focused_skill = None
        self.skills_used = []
        self.factors = []
        self.restrictions = []
        self._needs = init_needs(self)


        self.attributes = {
            'physique': 3,
            'mind': 3,
            'spirit': 3,
            'agility': 3,
            'sensitivity': 3
        }
        self.university = {'name': 'study', 'effort': 'bad', 'auto': False}
        self.mood = 0
        self.fatigue = 0
        self._vitality = 0
        self.appetite = 0
        self.calorie_storage = 0
        self.money = 0
        self._determination = 0
        self._anxiety = 0
        self.rewards = []
        self.used_rewards = []
        self.merit = 0  # player-only var for storing work results

        # Other persons known and relations with them, value[1] = [needed points, current points]
        self._relations = []
        self.selfesteem = 0
        self.conditions = []
        self.genus = init_genus(self, genus)
        self.add_feature(age)
        self.add_feature(gender)
        self.set_avatar()
        persons_list.append(self)
Example #51
0
from alignment import align_sequences
from alignment import Alignment

from alignment.utils import merge

sequence_a = 'voldemort'
sequence_b = 'waldemort'

# align the two sequences
align_a, align_b, distance = align_sequences(sequence_a, sequence_b)
# construct a new Alignment object
alignment = Alignment.from_sequences(align_a, align_b)
# pretty print the alignment
print alignment

Example #52
0
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    amr = inAMR
    
    
    # clean up role names: :mod-nn and :MOD => :mod
    repltriples = [(x, r, (y,)) for x,r,(y,) in amr.triples(instances=False) if r in ['mod-NN','MOD']]
    newtriples = [(x, 'mod', (y,)) for x,r,(y,) in repltriples]
    amr = new_amr_from_old(amr, new_triples=newtriples, avoid_triples=repltriples)
    
    
    
    
    # for each triple of the form <x :-COREF y>, delete the triple and replace 
    # all occurrences of y with x
    
    
    
    triples = amr.triples(instances=False)
    
    # Use -COREF flags to establish a mapping from current to new variables 
    
    coref_triples = [trip for trip in triples if trip[1]=='-COREF']
    replacements = {}
    for coref_trip in coref_triples:
        x, _, (y,) = coref_trip
        # TODO: strengthen the choice of main concepts for the cluster
        '''
        assert amr.get_concept(x).replace('-ROOT','')==amr.get_concept(y).replace('-ROOT','') \
            or (alignment[int(y):] is not None and wTags[alignment[int(y):]]["PartOfSpeech"] in ['PRP','PRP$']) \
            or amr.get_concept(y).endswith('-FALLBACK'), (y,ww[alignment[int(y):]],x,ww[alignment[int(x):]])
        '''
        if x in replacements and replacements[x]==y: # avoid 2-node cycle
            continue
        replacements[y] = x

    # Avoid a chain of replacements, e.g. a -> b and b -> c
    # Assume there are no cycles, otherwise this will loop infinitely
    while set(replacements.keys()) & set(replacements.values()):
        for k in replacements.keys():
            if replacements[k] in replacements:
                assert replacements[k]!=k,('Self-coreferent?',k,'in',sentenceId,replacements)
                replacements[k] = replacements[replacements[k]]
                break
    
    # MERGE the coreferent nodes
    
    all_triples = []
    trip2tokAlignment = Alignment('many2one') # source side indexes 'all_triples'
    
    newtriples = []
    oldtriples = coref_triples
    for a, r, (b,) in triples:
        if r=='-COREF': continue
        trip = (a,r,(b,))
        
        change = False
        if a in replacements:
            a = replacements[a]
            change = True
        if b in replacements:
            b = replacements[b]
            change = True
        if change:
            newtriples.append((a,r,b))
            oldtriples.append(trip)
            
        if isinstance(b,basestring) and b in amr.node_to_concepts and alignment[int(b):] is not None:
            trip2tokAlignment.link(len(all_triples), alignment[int(b):])
        all_triples.append((a,r,b))
        
        
    amr = new_amr_from_old(amr, new_triples=newtriples, avoid_triples=oldtriples, avoid_concepts=replacements)
    
    
    # delete various decorations
    for k,v in amr.node_to_concepts.items():
        amr.node_to_concepts[k] = v.replace('-FALLBACK_PRON','').replace('-FALLBACK','').replace('-DATE_RELATIVE','').replace('-DATE','').replace('-TIME','')
    
    if config.verbose:
        print('Triple-to-token alignment:',{trip:ww[trip2tokAlignment[t:]]+'-'+str(trip2tokAlignment[t:]) for t,trip in enumerate(all_triples) if trip2tokAlignment[t:] is not None},
              file=sys.stderr)
    
    
    
    
    
    # delete CARDINAL concepts (cf. the nes module) unless the concept has no parent
    # e.g. in wsj_0077.14, "154.2 million shares" is converted from (s / shares :quant (c / CARDINAL :quant 154200000)) to (s / shares :quant 154200000)
    cardinals = {v for v,c in amr.node_to_concepts.items() if c=='CARDINAL'}
    for v in cardinals:
        old2newvars = {}
        triples = [(x,r,y) for x,r,(y,) in amr.triples(instances=False) if x==v or y==v]
        try:
            assert 1<=len(triples)<=2,(triples,amr)
        except AssertionError:  # something complicated; just punt
            continue
        if len(triples)<2: continue
        t1, t2 = triples
        if t1[2]!=v:
            t1, t2 = t2, t1
        assert t1[2]==t2[0]==v
        old2newvars[v] = t2[2]
        del amr.node_to_concepts[v]
        
        newtrip = (t1[0],t1[1],t2[2])
        assert newtrip[0]!=newtrip[2]
        # replace t1 and t2 with newtrip
        amr = new_amr_from_old(amr, new_triples=[newtrip], avoid_triples=[t1,t2])
        if config.verbose: print('merge CARDINAL:',[t1,t2],'->',newtrip, file=sys.stderr)
        
        t = all_triples.index(t1)
        #assert trip2tokAlignment[t:] is not None
        all_triples[t] = newtrip
        #assert trip2tokAlignment[all_triples.index(t2):] is None
        
        #amr = new_amr([(old2newvars.get(x,x), r, (old2newvars.get(y,y),)) for x,r,(y,) in amr.triples(instances=False) if x!=v], amr.node_to_concepts)
    
    # choose user-friendly variable names
    # assumes current variable names are all integer strings
    old2newvars = {}
    newconcepts = {}
    for v,c in amr.node_to_concepts.items():
        v2 = c[0].lower() if c[0].isalpha() else v
        if v2 in newconcepts:    # append numerical suffix if necessary to disambiguate
            assert v2.isalpha()
            v2 += str(sum(1 for k in newconcepts.keys() if k[0]==v2))
        newconcepts[v2] = c
        old2newvars[v] = v2
    all_triples2 = []
    trip2tokAlignment2 = Alignment('many2one')
    for x,r,(y,) in amr.triples(instances=False):
        t = all_triples.index((x,r,y))
        if trip2tokAlignment[t:] is not None:
            trip2tokAlignment2.link(len(all_triples2), trip2tokAlignment[t:])
        all_triples2.append((old2newvars.get(x,x), r, (old2newvars.get(y,y),)))
    
    finalAlignment = {trip:ww[trip2tokAlignment2[t:]]+'-'+str(trip2tokAlignment2[t:]) for t,trip in enumerate(all_triples2) if trip2tokAlignment2[t:] is not None}
    if config.verbose:
        print('Final triple-to-token alignment:',finalAlignment,
              file=sys.stderr)
    
    amr = new_amr(all_triples2, newconcepts)
    
    
    # detect orphans (variables with no triples)
    orphans = {v: True for v in newconcepts}
    for x,r,(y,) in amr.triples(instances=False):
        if r=='-DUMMY': continue
        orphans[x] = False
        if y in orphans:
            orphans[y] = False
    orphans = [v for v in orphans if orphans[v]]
    if config.verbose: print(len(orphans),'orphans',orphans, file=sys.stderr)
    
    # ensure a node has a :-DUMMY annotation iff it is an orphan
    amr = new_amr([(x,r,(y,)) for x,r,(y,) in amr.triples(instances=False) if r!='-DUMMY']+[(o,'-DUMMY','') for o in orphans], newconcepts)
    
    
    def swap_callback((x,r,(y,)),(x2,r2,(y2,))):
        #TODO: fix alignments
        pass
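The chain-collapse loop in the coreference merge above (a -> b plus b -> c becomes a -> c) can be exercised standalone. A minimal sketch with hypothetical variable names; like the original, it assumes the mapping contains no cycles:

replacements = {'a': 'b', 'b': 'c', 'd': 'c'}
while set(replacements.keys()) & set(replacements.values()):
    for k in list(replacements.keys()):
        if replacements[k] in replacements:
            replacements[k] = replacements[replacements[k]]
            break
print(replacements)   # {'a': 'c', 'b': 'c', 'd': 'c'}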
Example #53
0
import cv2
import sys
import numpy as np
import datetime
from alignment import Alignment
sys.path.append('../SSH')
from ssh_detector import SSHDetector

#short_max = 800
scales = [1200, 1600]
t = 2

detector = SSHDetector('../SSH/model/e2ef', 0)
alignment = Alignment('./model/3d_I5', 12)
out_filename = './out.png'

f = '../sample-images/t1.jpg'
if len(sys.argv)>1:
  f = sys.argv[1]
img = cv2.imread(f)
im_shape = img.shape
print(im_shape)
target_size = scales[0]
max_size = scales[1]
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
if im_size_min>target_size or im_size_max>max_size:
  im_scale = float(target_size) / float(im_size_min)
  # prevent bigger axis from being more than max_size:
  if np.round(im_scale * im_size_max) > max_size:
      im_scale = float(max_size) / float(im_size_max)
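The scaling block above brings the short image side to target_size, then backs off if that would push the long side past max_size. A worked check with a hypothetical 1500x3000 image (plain numbers, no cv2 needed):

target_size, max_size = 1200, 1600
im_size_min, im_size_max = 1500, 3000                 # hypothetical H, W
im_scale = float(target_size) / float(im_size_min)    # 0.8
if round(im_scale * im_size_max) > max_size:          # 2400 > 1600
    im_scale = float(max_size) / float(im_size_max)   # ~0.533
print(im_scale)   # the 3000px side shrinks to 1600px, the 1500px side to 800px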
Example #54
0
    def compute_observation_probabilities(self):
        print("[++++++++] Compute probabilities of observation constraints")
        messages_aligned = Alignment.get_messages_aligned(
            self.messages,
            os.path.join(self.output_dir, Alignment.FILENAME_OUTPUT_ONELINE))
        messages_request, messages_response = Processing.divide_msgs_by_directionlist(
            self.messages, self.direction_list)
        messages_request_aligned, messages_response_aligned = Processing.divide_msgs_by_directionlist(
            messages_aligned, self.direction_list)

        fid_list_request = self.filter_fields(self.fields, self.fid_list,
                                              messages_request_aligned)
        fid_list_response = self.filter_fields(self.fields, self.fid_list,
                                               messages_response_aligned)
        logging.debug(
            "request candidate fid: {}\nresponse candidate fid: {}".format(
                fid_list_request, fid_list_response))

        # compute matrix of similarity scores
        constraint_m_request, constraint_m_response = MessageSimilarity(
            messages=messages_request_aligned), MessageSimilarity(
                messages=messages_response_aligned)
        constraint_m_request.compute_similarity_matrix()
        constraint_m_response.compute_similarity_matrix()

        # the observation prob of each cluster: {fid: the list of observation probabilities ([pm,ps,pd,pv])}
        cluster_p_request, cluster_p_response = dict(), dict()
        # the size of each cluster
        cluster_size_request, cluster_size_response = dict(), dict()
        # the observation prob of each cluster pair: {fid-fid: [,]}
        pairs_p_request, pairs_p_response = dict(), dict()
        pairs_size_request, pairs_size_response = dict(), dict()

        for fid_request in fid_list_request:
            logging.info("[++++] Test Request Field {0}-*".format(fid_request))

            # merge other fields
            fields_merged_request = self.merge_nontest_fields(
                self.fields, fid_request)
            fid_merged_request = 0 if fid_request == 0 else 1

            # generate clusters
            symbols_request_aligned = self.cluster_by_field(
                fields_merged_request, messages_request_aligned,
                fid_merged_request)
            # change symbol names
            symbols_request_aligned = self.change_symbol_name(
                symbols_request_aligned)

            # compute prob of m,s,d,v
            cluster_p_request[fid_request] = list()
            cluster_p_request[fid_request].append(
                constraint_m_request.compute_constraint_message_similarity(
                    symbols_request_aligned))
            cluster_p_request[fid_request].append(
                self.compute_constraint_structure(symbols_request_aligned))
            cluster_p_request[fid_request].append(
                self.compute_constraint_dimension(symbols_request_aligned))
            cluster_p_request[fid_request].append(
                self.compute_constraint_value(symbols_request_aligned))
            cluster_size_request[fid_request] = [
                len(s.messages) for s in symbols_request_aligned.values()
            ]

            for fid_response in fid_list_response:
                #if fid_request != fid_response:
                #    continue
                logging.debug("[++] Test Response Field {0}-{1}".format(
                    fid_request, fid_response))

                # merge other fields
                fields_merged_response = self.merge_nontest_fields(
                    self.fields, fid_response)
                fid_merged_response = 0 if fid_response == 0 else 1

                # generate clusters
                symbols_response_aligned = self.cluster_by_field(
                    fields_merged_response, messages_response_aligned,
                    fid_merged_response)
                # change symbol names
                symbols_response_aligned = self.change_symbol_name(
                    symbols_response_aligned)

                # compute prob of m,s,d,v
                if fid_response not in cluster_p_response:
                    cluster_p_response[fid_response] = list()
                    cluster_p_response[fid_response].append(
                        constraint_m_response.
                        compute_constraint_message_similarity(
                            symbols_response_aligned))
                    cluster_p_response[fid_response].append(
                        self.compute_constraint_structure(
                            symbols_response_aligned))
                    cluster_p_response[fid_response].append(
                        self.compute_constraint_dimension(
                            symbols_response_aligned))
                    cluster_p_response[fid_response].append(
                        self.compute_constraint_value(
                            symbols_response_aligned))
                    cluster_size_response[fid_response] = [
                        len(s.messages)
                        for s in symbols_response_aligned.values()
                    ]

                # print msg numbers of each cluster
                logging.debug("Number of request symbols: {0}".format(
                    len(symbols_request_aligned.values())))
                for s in symbols_request_aligned.values():
                    logging.debug("  Symbol {0} msgs numbers: {1}".format(
                        str(s.name), len(s.messages)))
                logging.debug("Number of response symbols: {0}".format(
                    len(symbols_response_aligned.values())))
                for s in symbols_response_aligned.values():
                    logging.debug("  Symbol {0} msgs numbers: {1}".format(
                        str(s.name), len(s.messages)))

                # compute remote coupling probabilities
                rc = RemoteCoupling(messages_all=messages_aligned,
                                    symbols_request=symbols_request_aligned,
                                    symbols_response=symbols_response_aligned,
                                    direction_list=self.direction_list)
                rc.compute_pairs_by_directionlist()
                fid_pair = "{}-{}".format(fid_request, fid_response)
                p_r_request = rc.compute_constraint_remote_coupling(
                    RemoteCoupling.TEST_TYPE_REQUEST)
                p_r_response = rc.compute_constraint_remote_coupling(
                    RemoteCoupling.TEST_TYPE_RESPONSE)

                logging.debug(
                    "[+] Observation Prob Results for pairs {}".format(
                        fid_pair))
                p_m, p_s, p_d, p_v = cluster_p_request[fid_request][
                    0], cluster_p_request[fid_request][1], cluster_p_request[
                        fid_request][2], cluster_p_request[fid_request][3]
                logging.debug(
                    "Request:\nPm: {0}\nPr: {1}\nPs: {2}\nPd: {3}\nPv: {4}".
                    format(p_m, p_r_request, p_s, p_d, p_v))
                pairs_p_request[fid_pair] = [p_m, p_r_request, p_s, p_d, p_v]
                pairs_size_request[fid_pair] = cluster_size_request[
                    fid_request]

                p_m, p_s, p_d, p_v = cluster_p_response[fid_response][
                    0], cluster_p_response[fid_response][
                        1], cluster_p_response[fid_response][
                            2], cluster_p_response[fid_response][3]
                logging.debug(
                    "Response:\nPm: {0}\nPr: {1}\nPs: {2}\nPd: {3}\nPv: {4}".
                    format(p_m, p_r_response, p_s, p_d, p_v))
                pairs_p_response[fid_pair] = [p_m, p_r_response, p_s, p_d, p_v]
                pairs_size_response[fid_pair] = cluster_size_response[
                    fid_response]

                del rc
                del symbols_response_aligned  #symbols
                del fields_merged_response
                gc.collect()
            del symbols_request_aligned
            del fields_merged_request
            gc.collect()

        pairs_p = [pairs_p_request, pairs_p_response]
        pairs_size = [pairs_size_request, pairs_size_response]

        return pairs_p, pairs_size
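For orientation, the return value above keys each request/response field pair by a "fid-fid" string. A hypothetical shape of the result, with invented probabilities and cluster sizes purely for illustration:

pairs_p = [
    {'0-3': [0.8, 0.6, 0.7, 0.5, 0.9]},   # request side: [Pm, Pr, Ps, Pd, Pv]
    {'0-3': [0.7, 0.5, 0.6, 0.4, 0.8]},   # response side: [Pm, Pr, Ps, Pd, Pv]
]
pairs_size = [
    {'0-3': [40, 25]},                    # per-cluster message counts, request split
    {'0-3': [38, 27]},                    # per-cluster message counts, response split
]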
Example #55
0
class Analysis(object):
    """Performs the analysis and collects the results"""
    def __init__(self, cfg, force_restart, threads):
        the_config.validate()

        # TODO: Remove -- put this all into "options"
        if threads == -1:
            threads = threadpool.get_cpu_count()

        self.threads = threads

        # TODO: Move these to the config validate and prepare
        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Make some folders for the analysis
        the_config.make_output_folders()
        the_config.database = Database(the_config)

        # Check for old analyses to see if we can use the old data
        the_config.check_for_old_config()

        # TODO: This is going to be in "Prepare"
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

        # Store the result in here
        self.results = results.AnalysisResults(the_config.model_selection)

    def process_restart(self, force_restart):
        if force_restart:
            # Remove everything
            if os.path.exists(the_config.output_path):
                log.warning("Deleting all previous workings in '%s'" %
                            the_config.output_path)
                shutil.rmtree(the_config.output_path)
        else:
            # Remove the schemes folder, and clean out the phylofiles folder
            if os.path.exists(the_config.schemes_path):
                log.debug("Removing files in '%s'" % the_config.schemes_path)
                shutil.rmtree(the_config.schemes_path)
            if os.path.exists(the_config.phylofiles_path):
                log.debug("Removing files in '%s'" % the_config.phylofiles_path)
                shutil.rmtree(the_config.phylofiles_path)


    def analyse(self):
        try:
            self.do_analysis()
        finally:
            # TODO: Not really the right place for it?
            the_config.database.close()
        return self.results



    def make_alignment(self, source_alignment_path):
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError

            compare = lambda x, y: collections.Counter(x) == collections.Counter(y)

            if not compare(old_align.species, self.alignment.species):
                log.error("""Species names in alignment have changed since previous run. You
                     need to use the force-restart option.""")
                raise AnalysisError


        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        if os.path.exists(tree_path):
            if ';' in open(tree_path).read():
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info("""Starting tree file found but it is incomplete.
                             Re-estimating""")
                redo_tree = True
        else:
            log.info("Starting tree will be estimated from the data.")
            redo_tree = True

        return redo_tree

    def make_tree(self, user_path):
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            the_config.start_tree_path,  'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(subset_with_everything, self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(
            the_config.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path, 'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree == True:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype, the_config.cmdline_extras)
                need_bl = True
            elif the_config.no_ml_tree == False:
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML" % tree_path)

                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme", range(len(the_config.user_subsets)))

                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype, the_config.cmdline_extras, tree_scheme, self.threads)
                
                # here we copy the ML tree topology so it can be used with PhyML too
                # TODO: this is a hack, and it would be better to decide on a universal
                # name for the different types of tree we might have.
                phyml_tree = os.path.join(os.path.dirname(topology_path), "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)

                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path,
                    topology_path,
                    the_config.datatype,
                    the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                 self.tree_path)

    def run_task(self, model_name, sub):
        # This bit should run in parallel (forking the processor)
        try:
            the_config.processor.analyse(
                model_name,
                sub.alignment_path,
                self.tree_path,
                the_config.branchlengths,
                the_config.cmdline_extras
            )
            fabricate = False
        except ExternalProgramError:
            if not the_config.suppress_errors:
                # In the Kmeans algorithm we suppress errors and "fabricate"
                # subsets (we assume the error is because the subset is too
                # small for analysis)
                raise

            # If it is kmeans we assume that the error is because the subset
            # is too small or unanalysable, so we fabricate it
            log.debug("New subset could not be analysed. It will be merged "
                        "at the end of the analysis")
            fabricate = True

        # Not entirely sure that WE NEED to block here, but it is safer to do so.
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            if fabricate:
                sub.fabricate_model_result(the_config, model_name)
            else:
                sub.parse_model_result(the_config, model_name)

            # Try finalising, then the result will get written out earlier...
            sub.finalise(the_config)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        for func, args in tasks:
            log.debug("About to analyse subset %s", args[1].name)
            func(*args)

    def run_threaded(self, tasks):
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_list_of_subsets(self, all_subsets):
        # get a whole list of subsets analysed in parallel

        # analyse bigger subsets first, for efficiency
        all_subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

        # chunk the list into blocks of ~1000 tasks
        # in empirical testing, this speeds things up a lot,
        # though we are not entirely sure why...
        n = 1000
        n = int(n / len(the_config.models))
        if n < 1:
            n = 1  # seems unlikely...

        log.debug("chunk size (in number of subsets) = %d", n)

        subset_chunks = [all_subsets[i:i + n] for i in xrange(0, len(all_subsets), n)]
        
        for subsets in subset_chunks:
            # prepare the list of tasks
            tasks = []
            for sub in subsets:
                if sub.is_done:
                    pass
                elif sub.is_prepared:
                    self.add_tasks_for_sub(tasks, sub)
                else:
                    sub.prepare(the_config, self.alignment)
                    self.add_tasks_for_sub(tasks, sub)
            if tasks:
                # Now do the analysis
                if self.threads == 1:
                    self.run_concurrent(tasks)
                else:
                    self.run_threaded(tasks)

        # Now see if we're done
        for sub in all_subsets:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(the_config):
                log.error("Failed to run models %s; not sure why" %
                          ", ".join(list(sub.models_not_done)))
                raise AnalysisError

    def analyse_scheme(self, sch):
        # Progress
        the_config.progress.next_scheme()

        # analyse the subsets in the scheme that aren't done
        # NB for most schemes we will have all subsets done, so this saves time
        not_done = []
        for sub in sch:
            if not sub.is_done:
                not_done.append(sub)
        if not_done:
            self.analyse_list_of_subsets(not_done)

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq, the_config.branchlengths, the_config.model_selection)
        self.results.add_scheme_result(sch, result)

        return result