class ClusterCladeEnrichment(db.Model):
    """Stores, per co-expression cluster, how strongly a phylogenetic clade is
    enriched among the cluster's gene families (hypergeometric test, BH/FDR
    corrected)."""
    __tablename__ = 'cluster_clade_enrichment'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    cluster_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    clade_id = db.Column(db.Integer, db.ForeignKey('clades.id', ondelete='CASCADE'))
    gene_family_method_id = db.Column(
        db.Integer, db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'))

    gene_family_method = db.relationship('GeneFamilyMethod',
                                         backref=db.backref('clade_enrichment',
                                                            lazy='dynamic',
                                                            passive_deletes=True),
                                         lazy='joined')

    cluster = db.relationship('CoexpressionCluster',
                              backref=db.backref('clade_enrichment',
                                                 lazy='dynamic',
                                                 passive_deletes=True),
                              lazy='joined')

    clade = db.relationship('Clade',
                            backref=db.backref('enriched_clusters',
                                               lazy='dynamic',
                                               passive_deletes=True),
                            lazy='joined')

    # Counts required to calculate the enrichment, stored here for quick access
    cluster_count = db.Column(db.Integer)
    cluster_size = db.Column(db.Integer)
    clade_count = db.Column(db.Integer)
    clade_size = db.Column(db.Integer)

    # Enrichment score (log-transformed), p-value and corrected p-value.
    # Calculated using the hypergeometric distribution with FDR (aka. BH) correction.
    enrichment = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    @property
    def cluster_percentage(self):
        """Share (in percent) of the cluster occupied by the clade's families."""
        return 100 * self.cluster_count / self.cluster_size

    @property
    def genome_percentage(self):
        """Share (in percent) of the clade's families found in this cluster."""
        return 100 * self.clade_count / self.clade_size
class CoexpressionClusterSimilarity(db.Model):
    """Pairwise similarity between two co-expression clusters, scored by the
    Jaccard index of their gene-family content (for one family method)."""
    __tablename__ = 'coexpression_cluster_similarity'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    # Both ends of the pair point at coexpression_clusters; a pair is stored
    # directed (source -> target).
    source_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    target_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    # Family method used to compute the overlap; indexed as queries filter on it.
    gene_family_method_id = db.Column('gene_family_method_id', db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    jaccard_index = db.Column(db.Float, index=True)
    p_value = db.Column(db.Float, index=True)
    corrected_p_value = db.Column(db.Float, index=True)

    # Two relationships to the same table require explicit foreign_keys.
    source = db.relationship('CoexpressionCluster',
                             backref=db.backref('similarity_sources',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined',
                             foreign_keys=[source_id])

    target = db.relationship('CoexpressionCluster',
                             backref=db.backref('similarity_targets',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined',
                             foreign_keys=[target_id])

    gene_family_method = db.relationship('GeneFamilyMethod',
                                         backref=db.backref(
                                             'CoexpressionClusterSimilarities',
                                             passive_deletes=True),
                                         lazy='joined')

    @staticmethod
    def empty_table():
        """
        Delete all content from this table. Use carefully !
        """
        CoexpressionClusterSimilarity.query.delete()
class SequenceSequenceCladeAssociation(db.Model):
    """Records, for a pair of sequences in a reconciled gene tree, the clade of
    their last common ancestor and whether that node is a duplication or a
    speciation event."""
    __tablename__ = 'sequence_sequence_clade'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_one_id = db.Column(db.Integer,
                                db.ForeignKey('sequences.id', ondelete='CASCADE'))
    sequence_two_id = db.Column(db.Integer,
                                db.ForeignKey('sequences.id', ondelete='CASCADE'))
    clade_id = db.Column(db.Integer,
                         db.ForeignKey('clades.id', ondelete='CASCADE'),
                         index=True)
    tree_id = db.Column(db.Integer,
                        db.ForeignKey('trees.id', ondelete='CASCADE'),
                        index=True)

    # 1 = duplication node, 0 = speciation node
    duplication = db.Column(db.SmallInteger)
    # Only filled in for duplication nodes
    duplication_consistency_score = db.Column(db.Float)

    tree = db.relationship('Tree',
                           lazy='joined',
                           backref=db.backref('sequence_sequence_clade_associations',
                                              lazy='dynamic',
                                              passive_deletes=True))

    clade = db.relationship('Clade',
                            lazy='joined',
                            backref=db.backref('sequence_sequence_clade_associations',
                                               lazy='dynamic',
                                               passive_deletes=True))

    def __str__(self):
        return "%d" % self.id

    @property
    def readable_type(self):
        """
        Returns type (duplication or speciation) in a human-readable format

        :return: string Duplication or Speciation
        """
        if self.duplication:
            return "Duplication"
        return "Speciation"

    @property
    def readable_score(self):
        """
        Returns the duplication consistency score in a nicer format

        :return: string with the score in %.3f format, or "Not available"
        for speciation nodes.
        """
        if not self.duplication:
            return "Not available"
        return "%.3f" % self.duplication_consistency_score
class SequenceFamilyAssociation(db.Model):
    """Association table linking a sequence to the gene family it belongs to."""
    __tablename__ = 'sequence_family'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    gene_family_id = db.Column(
        db.Integer, db.ForeignKey('gene_families.id', ondelete='CASCADE'))

    sequence = db.relationship('Sequence',
                               backref=db.backref('family_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    family = db.relationship('GeneFamily',
                             backref=db.backref('sequence_associations',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined')
class FamilyGOAssociation(db.Model):
    """Association table linking a gene family to a GO annotation."""
    __tablename__ = 'family_go'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    gene_family_id = db.Column(
        db.Integer, db.ForeignKey('gene_families.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    gene_family = db.relationship('GeneFamily',
                                  backref=db.backref('go_annotations',
                                                     lazy='dynamic',
                                                     passive_deletes=True),
                                  lazy='joined')

    go_term = db.relationship('GO',
                              backref=db.backref('family_associations',
                                                 lazy='dynamic',
                                                 passive_deletes=True),
                              lazy='joined')
class SequenceInterproAssociation(db.Model):
    """Association table linking a sequence to an InterPro domain, including
    the position (start/stop) of the domain on the sequence."""
    __tablename__ = 'sequence_interpro'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    interpro_id = db.Column(db.Integer,
                            db.ForeignKey('interpro.id', ondelete='CASCADE'))
    # Coordinates of the domain hit on the sequence; None when unknown
    start = db.Column(db.Integer, default=None)
    stop = db.Column(db.Integer, default=None)

    sequence = db.relationship('Sequence',
                               backref=db.backref('interpro_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    domain = db.relationship('Interpro',
                             backref=db.backref('sequence_associations',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined')
class SequenceCoexpressionClusterAssociation(db.Model):
    """Association table linking a sequence (through its expression probe) to a
    co-expression cluster."""
    __tablename__ = 'sequence_coexpression_cluster'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    # Name of the microarray probe / transcript identifier used in the network
    probe = db.Column(db.String(50), index=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    coexpression_cluster_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))

    sequence = db.relationship('Sequence',
                               backref=db.backref(
                                   'coexpression_cluster_associations',
                                   lazy='dynamic',
                                   passive_deletes=True),
                               lazy='joined')

    coexpression_cluster = db.relationship('CoexpressionCluster',
                                           backref=db.backref(
                                               'sequence_associations',
                                               lazy='dynamic',
                                               passive_deletes=True),
                                           lazy='joined')
class SequenceGOAssociation(db.Model):
    """Association table linking a sequence to a GO term, together with the GO
    evidence code, the annotation source and optional prediction details."""
    __tablename__ = 'sequence_go'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    # Standard GO evidence codes (experimental, curated and electronic)
    evidence = db.Column(
        db.Enum('EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'ISS', 'ISO', 'ISA',
                'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD', 'RCA', 'TAS', 'NAS',
                'IC', 'ND', 'IEA', name='evidence'))
    source = db.Column(db.Text)

    # Flag + serialized details for annotations added by a prediction pipeline
    predicted = db.Column(db.SmallInteger, default=False)
    prediction_data = db.Column(db.Text)

    sequence = db.relationship('Sequence',
                               backref=db.backref('go_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    go = db.relationship('GO',
                         backref=db.backref('sequence_associations',
                                            lazy='dynamic',
                                            passive_deletes=True),
                         lazy='joined')

    def __init__(self, sequence_id, go_id, evidence, source,
                 predicted=False, prediction_data=None):
        self.sequence_id = sequence_id
        self.go_id = go_id
        self.evidence = evidence
        self.source = source
        self.predicted = predicted
        self.prediction_data = prediction_data

    @property
    def data(self):
        """
        Property to get the information in the prediction_data as a dict.
        Useful for showing these values in e.g. jinja2 templates

        NOTE(review): raises TypeError when prediction_data is None (i.e. for
        non-predicted annotations) — callers should only use this when
        predicted is set.

        :return: de-serialized prediction_data (json)
        """
        return json.loads(self.prediction_data)
class ExpressionNetworkMethod(db.Model):
    """Describes one co-expression network (how it was built, for which
    species) and implements the ECC (Expression Context Conservation)
    computation between networks."""
    __tablename__ = 'expression_network_methods'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id'), index=True)
    description = db.Column(db.Text)
    # Whether edges carry HRR ranks or raw weights
    edge_type = db.Column(db.Enum("rank", "weight", name='edge_type'))
    # Cached number of probes, maintained by update_count()
    probe_count = db.Column(db.Integer)

    hrr_cutoff = db.Column(db.Integer)
    pcc_cutoff = db.Column(db.Float)
    enable_second_level = db.Column(db.SmallInteger)

    probes = db.relationship('ExpressionNetwork',
                             backref=db.backref('method', lazy='joined'),
                             lazy='dynamic',
                             cascade="all, delete-orphan",
                             passive_deletes=True)

    clustering_methods = db.relationship('CoexpressionClusteringMethod',
                                         backref='network_method',
                                         lazy='dynamic',
                                         cascade='all, delete-orphan',
                                         passive_deletes=True)

    def __init__(self, species_id, description, edge_type="rank"):
        self.species_id = species_id
        self.description = description
        self.edge_type = edge_type
        self.enable_second_level = False

    def __repr__(self):
        return str(self.id) + ". " + self.description + ' [' + str(self.species) + ']'

    @staticmethod
    def update_count():
        """
        To avoid long count queries the number of networks for each method can
        be precalculated and stored in the database using this function
        """
        methods = ExpressionNetworkMethod.query.all()

        for m in methods:
            m.probe_count = m.probes.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    @benchmark
    def calculate_ecc(network_method_ids, gene_family_method_id, max_size=100):
        """
        Function to calculate the ECC scores in and between genes of different
        networks.

        ORM free method for speed !

        :param network_method_ids: array of networks (using their internal id !)
        to compare
        :param gene_family_method_id: internal id of the type of family methods
        to be used for the comparison
        :param max_size: cap on the neighborhood-family count used when looking
        up permutation thresholds
        """
        network_families = {}
        sequence_network = {}
        sequence_network_method = {}
        sequence_family = {}
        family_sequence = {}

        # Get all the network information and store in dictionaries keyed on
        # the (internal) sequence id
        for n in network_method_ids:
            current_network = db.engine.execute(db.select([ExpressionNetwork.__table__.c.sequence_id,
                                                           ExpressionNetwork.__table__.c.network,
                                                           ExpressionNetwork.__table__.c.method_id]).
                                                where(ExpressionNetwork.__table__.c.method_id == n).
                                                where(ExpressionNetwork.__table__.c.sequence_id.isnot(None))
                                                ).fetchall()

            for sequence, network, network_method_id in current_network:
                if sequence is not None:
                    sequence_network[int(sequence)] = network
                    sequence_network_method[int(sequence)] = int(network_method_id)

        # Get family data (for the selected family method) and store in
        # dictionaries for lookups in both directions
        current_families = db.engine.execute(db.select([SequenceFamilyAssociation.__table__.c.sequence_id,
                                                        SequenceFamilyAssociation.__table__.c.gene_family_id,
                                                        GeneFamily.__table__.c.method_id]).
                                             select_from(SequenceFamilyAssociation.__table__.join(GeneFamily.__table__)).
                                             where(GeneFamily.__table__.c.method_id == gene_family_method_id)
                                             ).fetchall()

        for sequence, family, method in current_families:
            sequence_family[int(sequence)] = int(family)

            if family not in family_sequence.keys():
                family_sequence[int(family)] = []

            family_sequence[int(family)].append(int(sequence))

        # Create a dict (key = network) with the families present in that network
        # Families that occur multiple times should be present multiple times as
        # this is used to set thresholds later !
        for sequence, network_method in sequence_network_method.items():
            # ignore sequences without a family, ideally this shouldn't happen
            if network_method not in network_families.keys():
                network_families[network_method] = []
            if sequence in sequence_family.keys():
                family = sequence_family[sequence]
                network_families[network_method].append(family)

        # Determine threshold and p-value:
        # a background model is computed for each combination of networks; an
        # ECC score needs to beat 95 % of the randomly found values to be
        # considered significant
        thresholds = {}
        print("Starting permutation tests")
        for n in network_method_ids:
            thresholds[n] = {}
            for m in network_method_ids:
                thresholds[n][m] = ExpressionNetworkMethod.__set_thresholds(network_families[n],
                                                                            network_families[m],
                                                                            max_size=max_size)

        # Data loaded, start calculating ECCs for every intra-family pair
        new_ecc_scores = []

        for family, sequences in family_sequence.items():
            for i in range(len(sequences) - 1):
                query = sequences[i]
                for j in range(i+1, len(sequences)):
                    target = sequences[j]
                    if query in sequence_network.keys() and target in sequence_network.keys() and query != target:
                        # Ignore genes with overlapping neighborhoods
                        if not ExpressionNetworkMethod.__neighborhoods_overlap(sequence_network[query], sequence_network[target]):
                            ecc, significant = ExpressionNetworkMethod.__ecc(sequence_network[query],
                                                                             sequence_network[target],
                                                                             sequence_family,
                                                                             thresholds[sequence_network_method[query]][sequence_network_method[target]],
                                                                             family,
                                                                             max_size=max_size)
                            if significant:
                                new_ecc_scores.append({
                                    'query_id': query,
                                    'target_id': target,
                                    'ecc': ecc,
                                    'gene_family_method_id': gene_family_method_id,
                                    'query_network_method_id': sequence_network_method[query],
                                    'target_network_method_id': sequence_network_method[target],
                                })

                                # add reciprocal relation
                                new_ecc_scores.append({
                                    'query_id': target,
                                    'target_id': query,
                                    'ecc': ecc,
                                    'gene_family_method_id': gene_family_method_id,
                                    'query_network_method_id': sequence_network_method[target],
                                    'target_network_method_id': sequence_network_method[query],
                                })

                                # Flush to the database in batches of ~400
                                if len(new_ecc_scores) > 400:
                                    db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)
                                    new_ecc_scores = []

        # Write the remaining scores
        db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)

    @staticmethod
    def __neighborhoods_overlap(neighborhood_a, neighborhood_b):
        """
        Checks if two genes have overlapping networks

        :param neighborhood_a: neighborhood for first gene (string as stored in database)
        :param neighborhood_b: neighborhood for second gene (string as stored in database)
        :return: Bool, true if networks overlap
        """
        genes_a = set([n['gene_id'] for n in json.loads(neighborhood_a) if n['gene_id'] is not None])
        genes_b = set([n['gene_id'] for n in json.loads(neighborhood_b) if n['gene_id'] is not None])

        return len(genes_a.intersection(genes_b)) > 0

    @staticmethod
    def __ecc(q_network, t_network, families, thresholds, query_family, max_size=30):
        """
        Takes the networks neighborhoods (as stored in the databases), extracts
        the genes and find the families for each gene. Next the ECC score is
        calculated.

        :param q_network: network for the query gene
        :param t_network: network for the target gene
        :param families: dictionary that links a sequence id (key) to a family id (value)
        :param thresholds: significance thresholds matrix from __set_thresholds
        :param query_family: name of the input gene family (excluded from both
        neighborhoods so the shared family does not inflate the score)
        :return: the ECC score for the two input neighborhoods given the
        families, and a boolean flag if this is significant
        """
        q_data = json.loads(q_network)
        t_data = json.loads(t_network)

        q_genes = [t['gene_id'] for t in q_data if t['gene_id'] is not None]
        t_genes = [t['gene_id'] for t in t_data if t['gene_id'] is not None]

        q_families = [families[q] for q in q_genes if q in families.keys() and families[q] != query_family]
        t_families = [families[t] for t in t_genes if t in families.keys() and families[t] != query_family]

        if len(q_families) == 0 or len(t_families) == 0:
            # No family information on either side: not scorable
            return 0.0, False
        else:
            ecc = jaccard(q_families, t_families)

            # Cap neighborhood sizes so the threshold lookup stays in range
            q_size = len(set(q_families)) if len(set(q_families)) < max_size else max_size
            t_size = len(set(t_families)) if len(set(t_families)) < max_size else max_size

            t = thresholds[q_size-1][t_size-1]

            return ecc, ecc > t

    @staticmethod
    @benchmark
    def __set_thresholds(families_a, families_b, max_size=30, iterations=1000, step=5):
        """
        Empirically determine (permutation test) thresholds for ECC

        :param families_a: families of species_a (list of internal family ids)
        :param families_b: families of species_b (list of internal family ids)
        :param max_size: maximum number of families (default = 30)
        :param iterations: number of permutations done
        :param step: step size (thresholds are computed on a coarse grid and
        replicated step times in each dimension)
        :return: matrix (list of lists) with the thresholds at various family sizes
        """
        thresholds = []

        for i in range(0, max_size, step):
            print("%d done" % i)
            new_threshholds = []
            for j in range(0, max_size, step):
                scores = []
                for _ in range(iterations):
                    if i+1 < len(families_a) and j+1 < len(families_b):
                        i_fams = random.sample(families_a, i+1)
                        j_fams = random.sample(families_b, j+1)
                        scores.append(jaccard(i_fams, j_fams))
                    else:
                        # Cannot calculate threshold with these families, add 1
                        scores.append(1)
                # TODO (maybe?): cutoff is hard coded here, replace ?
                # NOTE(review): scores[int(iterations*0.95)] takes the 95th
                # percentile of the sorted permutation scores.
                print(iterations, len(scores), scores)
                scores = sorted(scores)
                # Replicate the value step times so the matrix has one entry
                # per family size up to max_size
                for _ in range(step):
                    new_threshholds.append(scores[int(iterations*0.95)])
            # NOTE(review): the same row list object is appended step times;
            # harmless as the matrix is only read afterwards.
            for _ in range(step):
                thresholds.append(new_threshholds)
        return thresholds
class Species(db.Model):
    """A species in the database, with display settings (colors), cached
    counts (sequences/profiles/networks) and relationships to all
    species-scoped data."""
    __tablename__ = 'species'
    id = db.Column(db.Integer, primary_key=True)
    code = db.Column(db.String(50, collation=SQL_COLLATION), unique=True)
    name = db.Column(db.String(200, collation=SQL_COLLATION))
    data_type = db.Column(db.Enum('genome', 'transcriptome', name='data_type'))
    # Colors used when rendering this species in plots/pages
    color = db.Column(db.String(7), default="#C7C7C7")
    highlight = db.Column(db.String(7), default="#DEDEDE")
    # Cached counts, maintained by update_counts()
    sequence_count = db.Column(db.Integer)
    network_count = db.Column(db.Integer)
    profile_count = db.Column(db.Integer)
    description = db.Column(db.Text)

    sequences = db.relationship('Sequence', backref='species', lazy='dynamic',
                                cascade="all, delete-orphan", passive_deletes=True)
    networks = db.relationship('ExpressionNetworkMethod', backref='species',
                               lazy='dynamic', cascade="all, delete-orphan",
                               passive_deletes=True)
    profiles = db.relationship('ExpressionProfile', backref='species',
                               lazy='dynamic', cascade="all, delete-orphan",
                               passive_deletes=True)
    expression_specificities = db.relationship('ExpressionSpecificityMethod',
                                               backref='species', lazy='dynamic',
                                               cascade="all, delete-orphan",
                                               passive_deletes=True)
    condition_tissues = db.relationship('ConditionTissue', backref='species',
                                        lazy='dynamic',
                                        cascade="all, delete-orphan",
                                        passive_deletes=True)

    def __init__(self, code, name, data_type='genome', color="#C7C7C7",
                 highlight="#DEDEDE", description=None):
        self.code = code
        self.name = name
        self.data_type = data_type
        self.color = color
        self.highlight = highlight
        self.sequence_count = 0
        self.profile_count = 0
        self.network_count = 0
        self.description = description

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @property
    def has_interpro(self):
        """True if at least one sequence of this species has an InterPro domain."""
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_interpro import SequenceInterproAssociation

        domain = SequenceInterproAssociation.query.join(
            Sequence, Sequence.id == SequenceInterproAssociation.sequence_id).filter(
            Sequence.species_id == self.id).first()

        return domain is not None

    @property
    def has_go(self):
        """True if at least one sequence of this species has a GO annotation."""
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_go import SequenceGOAssociation

        go = SequenceGOAssociation.query.join(
            Sequence, Sequence.id == SequenceGOAssociation.sequence_id).filter(
            Sequence.species_id == self.id).first()

        return go is not None

    @staticmethod
    def add(code, name, data_type='genome', color="#C7C7C7",
            highlight="#DEDEDE", description=None):
        """
        Add a species to the database if it is not present yet.

        :param code: short (unique) species code
        :param name: full species name
        :param data_type: 'genome' or 'transcriptome'
        :param color: default display color
        :param highlight: highlight display color
        :param description: optional description
        :return: internal id of the (existing or newly added) species
        """
        species = Species.query.filter_by(code=code).first()

        # species is already in the DB, return its id
        if species is not None:
            return species.id

        new_species = Species(code, name, data_type=data_type, color=color,
                              highlight=highlight, description=description)
        try:
            db.session.add(new_species)
            db.session.commit()
        except Exception as e:
            # BUG FIX: original used a bare `except:` calling db.rollback(),
            # which does not exist on the Flask-SQLAlchemy object — the
            # session is the object that can roll back.
            db.session.rollback()
            print(e)

        return new_species.id

    @staticmethod
    def update_counts():
        """
        To avoid long counts the number of sequences, profiles and networks
        can be precalculated and stored in the database using this function.
        """
        species = Species.query.all()

        for s in species:
            s.sequence_count = s.sequences.count()
            s.profile_count = s.profiles.count()
            s.network_count = s.networks.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
class CoexpressionClusteringMethod(db.Model):
    """Describes one clustering of a co-expression network and provides
    importers that build clusters (from neighborhoods, HCCA, or LSTrAP/MCL
    output) and write them to the database."""
    __tablename__ = 'coexpression_clustering_methods'
    id = db.Column(db.Integer, primary_key=True)
    network_method_id = db.Column(db.Integer,
                                  db.ForeignKey('expression_network_methods.id',
                                                ondelete='CASCADE'),
                                  index=True)
    method = db.Column(db.Text)
    # Cached number of clusters, maintained by update_counts()
    cluster_count = db.Column(db.Integer)

    clusters = db.relationship('CoexpressionCluster',
                               backref=db.backref('method', lazy='joined'),
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)

    @staticmethod
    def update_counts():
        """
        To avoid long counts the number of clusters per method can be
        precalculated and stored in the database using this function
        """
        methods = CoexpressionClusteringMethod.query.all()

        for m in methods:
            m.cluster_count = m.clusters.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def clusters_from_neighborhoods(method, network_method_id):
        """
        Turns each sequence's network neighborhood into one cluster (the
        sequence plus its neighbors) and stores the clusters in the database.

        :param method: description for the new clustering method
        :param network_method_id: id of the network to derive clusters from
        """
        probes = ExpressionNetwork.query.filter_by(
            method_id=network_method_id).all()  # Load all probes
        # NOTE(review): clusters is only ever assigned to (never appended via
        # the default factory), so a plain dict would behave identically.
        clusters = defaultdict(list)
        clusters_orm = {}
        sequence_to_probe = {}
        for p in probes:
            # Only consider probes linked with sequences
            if p.sequence_id is not None:
                sequence_to_probe[p.sequence_id] = p.probe
                neighborhood = json.loads(p.network)
                sequence_ids = [
                    n["gene_id"] for n in neighborhood
                    if "gene_id" in n.keys() and n["gene_id"] is not None
                ]

                # check if there are neighbors for this sequence
                if len(sequence_ids) > 0:
                    # cluster = the sequence itself plus all its neighbors
                    clusters[p.sequence.name] = [p.sequence_id] + sequence_ids

        # If there are valid clusters add them to the database
        if len(clusters) > 0:
            # Add new method first
            new_method = CoexpressionClusteringMethod()
            new_method.network_method_id = network_method_id
            new_method.method = method
            new_method.cluster_count = len(clusters)

            db.session.add(new_method)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Add Clusters (committing in batches of 400)
            for cluster in clusters.keys():
                clusters_orm[cluster] = CoexpressionCluster()
                clusters_orm[cluster].method_id = new_method.id
                clusters_orm[cluster].name = cluster
                db.session.add(clusters_orm[cluster])

                if len(clusters_orm) % 400 == 0:
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Add sequence cluster relations (committing every 20 clusters)
            for i, (cluster, members) in enumerate(clusters.items()):
                for sequence_id in members:
                    relation = SequenceCoexpressionClusterAssociation()
                    relation.sequence_id = sequence_id
                    relation.coexpression_cluster_id = clusters_orm[cluster].id
                    relation.probe = sequence_to_probe[
                        sequence_id] if sequence_id in sequence_to_probe.keys(
                    ) else None
                    db.session.add(relation)

                if i % 20 == 0:
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

    @staticmethod
    def build_hcca_clusters(method, network_method_id, step_size=3,
                            hrr_cutoff=30, min_cluster_size=40,
                            max_cluster_size=200):
        """
        method to build HCCA clusters for a certain network

        :param method: Name for the current clustering method
        :param network_method_id: ID for the network to cluster
        :param step_size: desired step_size for the HCCA algorithm
        :param hrr_cutoff: desired hrr_cutoff for the HCCA algorithm
        :param min_cluster_size: minimal cluster size
        :param max_cluster_size: maximum cluster size
        """
        network_data = {}
        sequence_probe = {}

        # Get network from DB
        print("Loading Network data from DB...", sep='')
        ExpressionNetworkMethod.query.get_or_404(
            network_method_id)  # Check if method exists

        probes = ExpressionNetwork.query.filter_by(
            method_id=network_method_id).all()  # Load all probes

        for p in probes:
            # Loop over probes and store hrr for all neighbors
            if p.sequence_id is not None:
                neighborhood = json.loads(p.network)
                network_data[p.sequence_id] = {
                    nb["gene_id"]: nb["hrr"]
                    for nb in neighborhood
                    if "gene_id" in nb.keys() and "hrr" in nb.keys()
                    and nb["gene_id"] is not None
                }
                sequence_probe[p.sequence_id] = p.probe

        # Double check edges are reciprocally defined
        # NOTE(review): this adds keys to network_data while iterating its
        # items(); in Python 3 that raises RuntimeError if a neighbor is
        # missing from the dict — confirm whether that branch can trigger.
        for sequence, data in network_data.items():
            for neighbor, score in data.items():
                if neighbor not in network_data.keys():
                    network_data[neighbor] = {sequence: score}
                else:
                    if sequence not in network_data[neighbor].keys():
                        network_data[neighbor][sequence] = score

        print("Done!\nStarting to build Clusters...\n")

        # Build clusters
        hcca_util = HCCA(step_size=step_size,
                         hrr_cutoff=hrr_cutoff,
                         min_cluster_size=min_cluster_size,
                         max_cluster_size=max_cluster_size)

        hcca_util.load_data(network_data)
        hcca_util.build_clusters()

        # Add new method to DB
        clusters = list(set([t[1] for t in hcca_util.clusters]))
        if len(clusters) > 0:
            print("Done building clusters, adding clusters to DB")

            # Add new method first
            new_method = CoexpressionClusteringMethod()
            new_method.network_method_id = network_method_id
            new_method.method = method
            new_method.cluster_count = len(clusters)

            db.session.add(new_method)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Add cluster and store as dict
            cluster_dict = {}
            for c in clusters:
                cluster_dict[c] = CoexpressionCluster()
                cluster_dict[c].method_id = new_method.id
                cluster_dict[c].name = c
                db.session.add(cluster_dict[c])

            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Link sequences to clusters
            for i, t in enumerate(hcca_util.clusters):
                gene_id, cluster_name, _ = t

                relation = SequenceCoexpressionClusterAssociation()
                relation.probe = sequence_probe[
                    gene_id] if gene_id in sequence_probe.keys() else None
                relation.sequence_id = gene_id
                relation.coexpression_cluster_id = cluster_dict[
                    cluster_name].id if cluster_name in cluster_dict.keys(
                ) else None

                if relation.coexpression_cluster_id is not None:
                    db.session.add(relation)

                if i > 0 and i % 400 == 0:
                    # Add relations in sets of 400
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
            # Add remaining relations
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)
        else:
            print("No clusters found! Not adding anything to DB !")

    @staticmethod
    def add_lstrap_coexpression_clusters(cluster_file, description, network_id,
                                         prefix='cluster_', min_size=10):
        """
        Adds MCL clusters, as produced by LSTrAP, to the database

        :param cluster_file: path to file with clusters
        :param description: description to add to database for this set of clusters
        :param network_id: network the clusters are based on
        :param prefix: prefix for individual clsuter names (default 'cluster_')
        :param min_size: minimal size of a cluster (default = 10)
        :return: ID of new clustering method
        """
        # get all sequences from the database and create a dictionary
        sequences = Sequence.query.all()

        sequence_dict = {}
        for member in sequences:
            sequence_dict[member.name.upper()] = member

        # add coexpression clustering method to the database
        clustering_method = CoexpressionClusteringMethod()
        clustering_method.network_method_id = network_id
        clustering_method.method = description

        try:
            db.session.add(clustering_method)
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
            quit()

        with open(cluster_file) as f:
            i = 1
            for line in f:
                probes = [p for p in line.strip().split()]
                # NOTE(review): replace('.1', '') strips the transcript suffix
                # but also removes '.1' occurring mid-name — confirm gene
                # identifiers cannot contain '.1' internally.
                genes = [p.replace('.1', '') for p in probes]
                cluster_id = "%s%04d" % (prefix, i)

                if len(probes) >= min_size:
                    i += 1

                    new_cluster = CoexpressionCluster()
                    new_cluster.method_id = clustering_method.id
                    new_cluster.name = cluster_id

                    db.session.add(new_cluster)
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
                        continue

                    for p, g in zip(probes, genes):
                        new_association = SequenceCoexpressionClusterAssociation()
                        new_association.probe = p
                        new_association.sequence_id = None
                        if g.upper() in sequence_dict.keys():
                            new_association.sequence_id = sequence_dict[
                                g.upper()].id
                        new_association.coexpression_cluster_id = new_cluster.id
                        db.session.add(new_association)

                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)

        return clustering_method.id
class TreeMethod(db.Model):
    """Describes how a set of gene trees was built and implements the
    reconciliation of those trees against the species clades."""
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)
    description = db.Column(db.Text)
    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Walks every binary node of every tree of this method, labels it with
        the clade of the species below it plus a duplication/speciation tag,
        stores all sequence-pair/clade associations, and writes the relabeled
        trees back as data_phyloxml.

        Numbered "N.====" prints are left-over debugging output.
        """
        print("\n1.====================Getting into function reconcile_trees")
        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        # Lookup tables: sequence name -> species code / internal id,
        # clade name -> member species / internal id
        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []
        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]
            print("\n3.=========================tree loaded ok")

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        # Warn (but don't abort) on non-binary nodes; only
                        # binary nodes can be reconciled.
                        print("\n5.================Non-Binary-node: " + str(node.is_binary))
                        print("Non-Binary tree: " + t.data_newick)
                    # Otherwise it is a leaf node and can be skipped
                    continue

                # Leaf names below each of the two child branches
                branch_one_seq = [
                    l.name.strip() for l in node.descendants[0].get_leaves()
                ]
                branch_two_seq = [
                    l.name.strip() for l in node.descendants[1].get_leaves()
                ]

                # Species sets per branch; sequences missing from
                # seq_to_species are silently dropped (can yield empty sets
                # when tree leaf names don't match database sequence names)
                branch_one_species = set([
                    seq_to_species[s] for s in branch_one_seq
                    if s in seq_to_species.keys()
                ])
                print("\n8.===============Branch-one-spp: " + ', '.join(branch_one_species))
                branch_two_species = set([
                    seq_to_species[s] for s in branch_two_seq
                    if s in seq_to_species.keys()
                ])
                print("\n9.===============Branch-two-spp: " + ', '.join(branch_two_species))

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species,
                                                   branch_two_species,
                                                   clade_to_species)
                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(
                        branch_one_species, branch_two_species)

                # Relabel the node as "<clade_id>_<D|S>_<consistency>"
                tags = [
                    clade_to_id[clade] if clade is not None else 0,
                    'D' if duplication else 'S',
                    duplication_consistency if duplication else 0
                ]

                node.name = '_'.join([str(t) for t in tags])

                if clade is not None:
                    # Store every cross-branch sequence pair (both directions)
                    for seq_one in branch_one_seq:
                        for seq_two in branch_two_seq:
                            new_associations.append({
                                'sequence_one_id': seq_to_id[seq_one],
                                'sequence_two_id': seq_to_id[seq_two],
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score':
                                    duplication_consistency
                            })
                            new_associations.append({
                                'sequence_one_id': seq_to_id[seq_two],
                                'sequence_two_id': seq_to_id[seq_one],
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score':
                                    duplication_consistency
                            })

                # Flush associations to the database in batches of ~400
                if len(new_associations) > 400:
                    db.engine.execute(
                        SequenceSequenceCladeAssociation.__table__.insert(),
                        new_associations)
                    new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # Write the remaining associations
        db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(),
                          new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data.keys():
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
class GO(db.Model):
    """Gene Ontology term.

    Stores the term itself (label, name, namespace, description), its place in
    the GO hierarchy (is_a, extended_go as ';'-separated label lists) and a
    cached phylo-profile with per-species sequence counts (species_counts,
    JSON string mapping species_id -> count).
    """
    __tablename__ = 'go'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    name = db.Column(db.Text)
    type = db.Column(db.Enum('biological_process', 'molecular_function',
                             'cellular_component', name='go_type'))
    description = db.Column(db.Text)
    obsolete = db.Column(db.SmallInteger)
    is_a = db.Column(db.Text)
    extended_go = db.Column(db.Text)
    species_counts = db.Column(db.Text)

    sequences = db.relationship('Sequence', secondary=sequence_go, lazy='dynamic')

    # Other properties
    #
    # sequence_associations declared in 'SequenceGOAssociation'
    # enriched_clusters declared in 'ClusterGOEnrichment'

    def __init__(self, label, name, go_type, description, obsolete, is_a, extended_go):
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.obsolete = obsolete
        self.is_a = is_a
        self.extended_go = extended_go
        self.species_counts = ""

    def set_all(self, label, name, go_type, description, extended_go):
        """Update the core fields at once; the cached species_counts is reset."""
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.extended_go = extended_go
        self.species_counts = ""

    @property
    def short_type(self):
        """Two-letter code for the GO namespace (BP/MF/CC, 'UNK' if unknown)."""
        if self.type == 'biological_process':
            return 'BP'
        elif self.type == 'molecular_function':
            return 'MF'
        elif self.type == 'cellular_component':
            return 'CC'
        else:
            return 'UNK'

    @property
    def readable_type(self):
        """Human-readable name for the GO namespace."""
        if self.type == 'biological_process':
            return 'Biological process'
        elif self.type == 'molecular_function':
            return 'Molecular function'
        elif self.type == 'cellular_component':
            return 'Cellular component'
        else:
            return 'Unknown type'

    @property
    def parent_count(self):
        """
        Returns the total number of terms 'above' this term in the GO DAG

        :return: number of (extended) parental terms
        """
        return len(self.extended_go.split(';')) if self.extended_go != '' else 0

    @property
    def interpro_stats(self):
        from conekt.models.interpro import Interpro

        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    def species_occurrence(self, species_id):
        """
        count how many genes have the current GO term in a given species

        :param species_id: internal id of the selected species
        :return: count of sequences with this term associated
        """
        count = 0
        sequences = self.sequences.all()

        for s in sequences:
            if s.species_id == species_id:
                count += 1

        return count

    @staticmethod
    def sequence_stats(sequence_ids, exclude_predicted=True):
        """
        Takes a list of sequence IDs and returns GO stats for those sequences

        :param sequence_ids: list of sequence ids
        :param exclude_predicted: if True (default) predicted GO labels will be excluded
        :return: dict with stats for each GO term linked with any of the input sequences
        """
        query = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids))

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences, exclude_predicted=True):
        """
        Same as sequence_stats but takes a query on sequences instead of a
        list of ids (avoids materialising the id list in Python).
        """
        subquery = sequences.subquery()

        query = SequenceGOAssociation.query

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.join(subquery, SequenceGOAssociation.sequence_id == subquery.c.id).all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        # Fold association rows into, per GO term, the unique sequences and
        # species they occur in plus a raw association count.
        output = {}
        for d in associations:
            if d.go_id not in output.keys():
                output[d.go_id] = {
                    'go': d.go,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.go_id]['count'] += 1
                if d.sequence_id not in output[d.go_id]['sequences']:
                    output[d.go_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.go_id]['species']:
                    output[d.go_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @staticmethod
    def update_species_counts():
        """
        Adds a phylo-profile to each GO label, results are stored in the
        database (species_counts column, JSON). Only non-predicted
        associations are counted.
        """
        # link species to sequences (ORM free for speed)
        sequences = db.engine.execute(db.select([Sequence.__table__.c.id, Sequence.__table__.c.species_id])).fetchall()

        sequence_to_species = {}
        for seq_id, species_id in sequences:
            if species_id is not None:
                sequence_to_species[seq_id] = int(species_id)

        # get GO terms for all genes
        associations = db.engine.execute(
            db.select([SequenceGOAssociation.__table__.c.sequence_id,
                       SequenceGOAssociation.__table__.c.go_id],
                      distinct=True)
            .where(SequenceGOAssociation.__table__.c.predicted == 0))\
            .fetchall()

        count = {}
        for seq_id, go_id in associations:
            species_id = sequence_to_species[seq_id]
            if go_id not in count.keys():
                count[go_id] = {}

            if species_id not in count[go_id]:
                count[go_id][species_id] = 1
            else:
                count[go_id][species_id] += 1

        # update counts
        for go_id, data in count.items():
            db.engine.execute(db.update(GO.__table__)
                              .where(GO.__table__.c.id == go_id)
                              .values(species_counts=json.dumps(data)))

    @staticmethod
    def add_from_obo(filename, empty=True, compressed=False):
        """
        Parses GeneOntology's OBO file and adds it to the database

        :param filename: Path to the OBO file to parse
        :param compressed: load data from .gz file if true (default: False)
        :param empty: Empty the database first when true (default: True)
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(GO).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        obo_parser = OBOParser()
        obo_parser.readfile(filename, compressed=compressed)

        obo_parser.extend_go()

        for i, term in enumerate(obo_parser.terms):
            go = GO(term.id, term.name, term.namespace, term.definition, term.is_obsolete,
                    ";".join(term.is_a), ";".join(term.extended_go))

            db.session.add(go)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_go_from_plaza(filename):
        """
        Adds GO annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        go_parser = GOParser()

        go_parser.read_plaza_go(filename)

        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for term in terms:
                    if term["id"] in go_hash.keys():
                        current_term = go_hash[term["id"]]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": term["evidence"],
                            "source": term["source"]}
                        associations.append(association)
                    else:
                        print(term, "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            if len(associations) > 400:
                db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                associations = []

        # Add extended GOs
        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []
                for term in terms:
                    if term["id"] not in current_terms:
                        current_terms.append(term["id"])
                for term in terms:
                    if term["id"] in go_hash.keys():
                        extended_terms = go_hash[term["id"]].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms and extended_term not in new_terms:
                                new_terms.append(extended_term)
                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                        associations = []

        # flush the remainder; skip when empty (an empty executemany would
        # insert a single all-defaults row)
        if associations:
            db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def add_go_from_tab(filename, species_id, source="Source not provided"):
        """
        Adds GO annotation from a tab-delimited file (gene, term, evidence)
        for one species and derives the extended (parental) annotations.

        :param filename: path to the annotation file
        :param species_id: internal id of the species the genes belong to
        :param source: label stored in the source field of new associations
        """
        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.filter_by(species_id=species_id).all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        gene_go = defaultdict(list)

        with open(filename, "r") as f:
            for line in f:
                gene, term, evidence = line.strip().split('\t')
                if gene in gene_hash.keys():
                    current_sequence = gene_hash[gene]
                    if term in go_hash.keys():
                        current_term = go_hash[term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": evidence,
                            "source": source}
                        associations.append(association)

                        if term not in gene_go[gene]:
                            gene_go[gene].append(term)

                    else:
                        print(term, "not found in the database.")
                else:
                    print("Gene", gene, "not found in the database.")

                if len(associations) > 400:
                    db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                    associations = []

        # Add extended GOs
        for gene, terms in gene_go.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []
                for term in terms:
                    if term not in current_terms:
                        current_terms.append(term)
                for term in terms:
                    if term in go_hash.keys():
                        extended_terms = go_hash[term].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms and extended_term not in new_terms:
                                new_terms.append(extended_term)
                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                        associations = []

        # flush the remainder; skip when empty (an empty executemany would
        # insert a single all-defaults row)
        if associations:
            db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def predict_from_network(expression_network_method_id, threshold=5, source="PlaNet Prediction"):
        """
        Function to transfer GO terms from neighbors in the network. If n or more (based on threshold) neighbors have a
        GO label (excluding other predicted labels) the term is transferred.

        :param expression_network_method_id: Expression network as input
        :param threshold: number of neighboring genes that should have the label to allow transfer
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        # Get all genes that belong to the network
        probes = expression_network_method.probes.all()

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # If the number of genes in the neighborhood is smaller than the threshold skip (no prediction possible)
            # If there is no sequence associated with the probe skip as well
            if len(sequence_ids) < threshold or probe.sequence_id is None:
                continue

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = set([(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms])

            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # Determine new terms (that occurred equal or more times than the desired threshold)
            new_terms = [{
                'go_id': k,
                'score': v
            } for k, v in go_counts.items() if v >= threshold]

            # Store new terms in a list that can be added to the database
            for nt in new_terms:
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': nt['go_id'],
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'score': nt['score'],
                                                   'threshold': threshold,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighbor counting'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])

    @staticmethod
    def predict_from_network_enrichment(expression_network_method_id, cutoff=0.05, source="PlaNet Prediction"):
        """
        Transfer GO terms that are significantly enriched (hypergeometric
        test + FDR correction) in a gene's first-level network neighborhood.

        :param expression_network_method_id: Expression network as input
        :param cutoff: maximum (uncorrected) p-value for a term to be transferred
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        probes = expression_network_method.probes.all()

        # Get all GO terms and get background
        # Important, counts are obtained from precomputed counts in the species_counts field !!
        go_data = db.engine.execute(db.select([GO.__table__.c.id, GO.__table__.c.species_counts])).fetchall()

        go_background = defaultdict(lambda: 0)

        for go_id, counts_json in go_data:
            # NOTE: was `counts_json is not ""` — identity comparison with a
            # literal is incorrect (and a SyntaxWarning); use equality.
            if counts_json != "":
                counts = json.loads(counts_json)
                if str(expression_network_method.species_id) in counts.keys():
                    go_background[go_id] = counts[str(expression_network_method.species_id)]

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = set([(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms])

            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # find significantly enriched GO terms and store them
            enriched_go = []

            for go_id, count in go_counts.items():
                p_value = hypergeo_sf(count, len(sequence_ids), go_background[go_id], len(probes))
                if p_value < cutoff:
                    enriched_go.append((go_id, p_value))

            # apply FDR correction to the p-values
            # (renamed from `corrected_p` — the loop below used the same name
            # for its loop variable, shadowing the list)
            corrected_p_values = fdr_correction([a[1] for a in enriched_go])

            # push new prediction in a dict that will be added to the DB
            for corrected_p, (go_id, p_value) in zip(corrected_p_values, enriched_go):
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': go_id,
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'p-cutoff': cutoff,
                                                   'p-value': p_value,
                                                   'p-value (FDR)': corrected_p,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighborhood enrichment'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])
class ExpressionSpecificityMethod(db.Model):
    """A method (set of conditions/tissues) used to score expression specificity
    of profiles for one species; individual scores live in ExpressionSpecificity."""
    __tablename__ = 'expression_specificity_method'
    id = db.Column(db.Integer, primary_key=True)
    description = db.Column(db.Text)
    conditions = db.Column(db.Text)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)

    specificities = db.relationship('ExpressionSpecificity', backref='method', lazy='dynamic',
                                    cascade="all, delete-orphan", passive_deletes=True)
    condition_tissue = db.relationship('ConditionTissue', backref='expression_specificity_method',
                                       lazy='joined',
                                       cascade="all, delete-orphan", passive_deletes=True,
                                       uselist=False)

    menu_order = db.Column(db.Integer)

    def __repr__(self):
        return str(self.id) + ". " + self.description + ' [' + self.species.name + ']'

    @staticmethod
    def calculate_specificities(species_id, description, remove_background=False):
        """
        Function that calculates condition specificities for each profile. No grouping is applied, each condition
        is used as is

        :param species_id: internal species ID
        :param description: description for the method to determine the specificity
        :param remove_background: when true the lowest value of each profile is subtracted from all values
        (can be of use with noisy data derived from microarrays)
        """
        conditions = []

        # get profiles from the database (ORM free for speed)
        profiles = db.engine.execute(
            db.select([
                ExpressionProfile.__table__.c.id,
                ExpressionProfile.__table__.c.profile
            ]).where(ExpressionProfile.__table__.c.species_id == species_id)).fetchall()

        # detect all conditions
        for profile_id, profile in profiles:
            profile_data = json.loads(profile)
            for condition in profile_data['order']:
                if condition not in conditions:
                    conditions.append(condition)

        # convert list into dictionary (identity mapping) and run function
        conditions_dict = {k: k for k in conditions}

        return ExpressionSpecificityMethod.calculate_tissue_specificities(
            species_id, description, conditions_dict, conditions,
            remove_background=remove_background)

    @staticmethod
    def calculate_tissue_specificities(species_id, description, condition_to_tissue, order,
                                       remove_background=False, use_max=True):
        """
        Function calculates tissue specific genes based on the expression conditions. A dict is required to link
        specific conditions to the correct tissues. This also allows conditions to be excluded in case they are
        unrelated with a specific tissue.

        :param species_id: internal species ID
        :param description: description for the method to determine the specificity
        :param condition_to_tissue: dict to connect a condition to a tissue
        :param order: preferred order of the conditions, will match tissues to it
        :param remove_background: subtracts the lowest value to correct for background noise
        :param use_max: uses the maximum of mean values instead of the mean of all values
        :return: id of the new method
        """
        new_method = ExpressionSpecificityMethod()
        new_method.species_id = species_id
        new_method.description = description
        new_method.menu_order = 0

        # derive the ordered, unique list of tissues from the condition order
        tissues = []
        for c in order:
            if c in condition_to_tissue.keys():
                v = condition_to_tissue[c]
                if v not in tissues:
                    tissues.append(v)

        # get profiles from the database (ORM free for speed)
        profiles = db.engine.execute(
            db.select([
                ExpressionProfile.__table__.c.id,
                ExpressionProfile.__table__.c.profile
            ]).where(ExpressionProfile.__table__.c.species_id == species_id)).fetchall()

        new_method.conditions = json.dumps(tissues)

        db.session.add(new_method)
        db.session.commit()

        # detect specificities and add to the database
        specificities = []

        for profile_id, profile in profiles:
            # prepare profile data for calculation: collapse conditions into
            # one value per tissue
            profile_data = json.loads(profile)
            profile_means = {}
            for t in tissues:
                values = []
                means = []

                valid_conditions = [
                    k for k in profile_data['data']
                    if k in condition_to_tissue and condition_to_tissue[k] == t
                ]

                for k, v in profile_data['data'].items():
                    if k in valid_conditions:
                        values += v
                        means.append(mean(v))

                if not use_max:
                    profile_means[t] = mean(values) if len(values) > 0 else 0
                else:
                    profile_means[t] = max(means) if len(means) > 0 else 0

            # subtract minimum value to remove background
            # experimental code !
            if remove_background:
                minimum = min([v for k, v in profile_means.items()])

                for k in profile_means.keys():
                    profile_means[k] -= minimum

            # determine spm score for each condition
            profile_specificities = []
            profile_tau = tau([v for _, v in profile_means.items()])
            profile_entropy = entropy_from_values([v for _, v in profile_means.items()])

            for t in tissues:
                score = expression_specificity(t, profile_means)
                new_specificity = {
                    'profile_id': profile_id,
                    'condition': t,
                    'score': score,
                    'entropy': profile_entropy,
                    'tau': profile_tau,
                    'method_id': new_method.id,
                }

                profile_specificities.append(new_specificity)

            # sort conditions and add top one; guard against profiles that map
            # to zero tissues (would raise IndexError on [0])
            if profile_specificities:
                profile_specificities = sorted(profile_specificities,
                                               key=lambda x: x['score'],
                                               reverse=True)
                specificities.append(profile_specificities[0])

            # write specificities to db if there are more than 400 (ORM free for speed)
            if len(specificities) > 400:
                db.engine.execute(ExpressionSpecificity.__table__.insert(), specificities)
                specificities = []

        # write remaining specificities to the db; skip when empty (an empty
        # executemany would insert a single all-defaults row)
        if specificities:
            db.engine.execute(ExpressionSpecificity.__table__.insert(), specificities)

        return new_method.id
class TreeMethod(db.Model):
    """Method used to construct a set of phylogenetic trees; owns the trees
    and can reconcile them against the species/clade information."""
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)
    description = db.Column(db.Text)

    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Reconcile all trees of this method: label each internal (binary) node
        with its clade and duplication/speciation status, store pairwise
        sequence-sequence clade associations in the DB and refresh each tree's
        PhyloXML-ready newick data.
        """
        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []
        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        # Print warning in case there is a non-binary node.
                        # NOTE: fixed — this used `tree.id`/`tree.label`, but
                        # `tree` is a newick Node (no such attributes); the
                        # database row `t` carries id and label.
                        print("[%d, %s] Skipping node... Can only reconcile binary nodes ..."
                              % (t.id, t.label))

                    # Otherwise it is a leaf node and can be skipped
                    continue

                branch_one_seq = [l.name.strip() for l in node.descendants[0].get_leaves()]
                branch_two_seq = [l.name.strip() for l in node.descendants[1].get_leaves()]

                branch_one_species = set([seq_to_species[s] for s in branch_one_seq
                                          if s in seq_to_species.keys()])
                branch_two_species = set([seq_to_species[s] for s in branch_two_seq
                                          if s in seq_to_species.keys()])

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species,
                                                   branch_two_species,
                                                   clade_to_species)
                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(
                        branch_one_species, branch_two_species)

                # encode clade id + event type (+ consistency score) into the
                # node name, e.g. "12_D_0.5" or "12_S_0"
                tags = [
                    clade_to_id[clade] if clade is not None else 0,
                    'D' if duplication else 'S',
                    duplication_consistency if duplication else 0
                ]

                node.name = '_'.join([str(tag) for tag in tags])

                if clade is not None:
                    # store the association in both directions so lookups work
                    # from either sequence
                    for seq_one in branch_one_seq:
                        for seq_two in branch_two_seq:
                            new_associations.append({
                                'sequence_one_id': seq_to_id[seq_one],
                                'sequence_two_id': seq_to_id[seq_two],
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score': duplication_consistency
                            })
                            new_associations.append({
                                'sequence_one_id': seq_to_id[seq_two],
                                'sequence_two_id': seq_to_id[seq_one],
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score': duplication_consistency
                            })

                if len(new_associations) > 400:
                    db.engine.execute(
                        SequenceSequenceCladeAssociation.__table__.insert(),
                        new_associations)
                    new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # flush the remainder; skip when empty (an empty executemany would
        # insert a single all-defaults row)
        if new_associations:
            db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(),
                              new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data.keys():
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
class ExpressionProfile(db.Model):
    """Expression profile of one probe/sequence: per-condition expression
    values stored as JSON in the (deferred) profile column."""
    __tablename__ = 'expression_profiles'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'), index=True)
    profile = db.deferred(db.Column(db.Text))

    specificities = db.relationship('ExpressionSpecificity',
                                    backref=db.backref('profile', lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    def __init__(self, probe, sequence_id, profile):
        self.probe = probe
        self.sequence_id = sequence_id
        self.profile = profile

    @staticmethod
    def __profile_to_table(data):
        """
        Internal function to convert an expression profile (dict) to a tabular text

        :param data: Dict with expression profile
        :return: table (string)
        """
        output = [["condition", "mean", "min", "max"]]
        order = data["order"]

        for o in order:
            try:
                values = data["data"][o]
                output.append([o,
                               str(mean(values)),
                               str(min(values)),
                               str(max(values))])
            except Exception as e:
                print(e)

        return '\n'.join(['\t'.join(l) for l in output])

    @property
    def table(self):
        """
        Returns the condition expression as a tabular text file

        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(json.loads(self.profile))

        return table

    def tissue_table(self, condition_tissue_id, use_means=True):
        """
        Returns the tissue expression as a tabular text file

        :param condition_tissue_id: condition_tissue_id for the conversion
        :param use_means: Use the mean of the condition (recommended)
        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(
            self.tissue_profile(condition_tissue_id, use_means=use_means))

        return table

    @property
    def low_abundance(self, cutoff=10):
        """
        Checks if the mean expression value in any conditions in the plot is higher than the desired cutoff

        NOTE(review): this is a property, so callers can never actually pass
        `cutoff`; the default of 10 is always used. Kept as-is for interface
        compatibility.

        :param cutoff: cutoff for expression, default = 10
        :return: True in case of low abundance otherwise False
        """
        data = json.loads(self.profile)

        checks = [mean(v) > cutoff for _, v in data["data"].items()]

        return not any(checks)

    @staticmethod
    def convert_profile(condition_to_tissue, profile_data, use_means=True):
        """
        Convert a full, detailed profile into a more general summarized one using conversion table stored in the
        database

        :param condition_to_tissue: dict with conversion instructions
        :param profile_data: profile to convert
        :param use_means: use means of detailed condition if True otherwise use samples independently. Default True
        :return: New profile
        """
        tissues = list(set(condition_to_tissue['conversion'].values()))

        output = {}

        for t in tissues:
            valid_conditions = [
                k for k in profile_data['data']
                if k in condition_to_tissue['conversion'] and condition_to_tissue['conversion'][k] == t
            ]
            valid_values = []
            for k, v in profile_data['data'].items():
                if k in valid_conditions:
                    if use_means:
                        valid_values.append(mean(v))
                    else:
                        valid_values += v

            output[t] = valid_values if len(valid_values) > 0 else [0]

        return {'order': condition_to_tissue['order'],
                'colors': condition_to_tissue['colors'],
                'data': output}

    def tissue_profile(self, condition_tissue_id, use_means=True):
        """
        Applies a conversion to the profile, grouping several condition into one more general feature (e.g. tissue).

        :param condition_tissue_id: identifier of the conversion table
        :param use_means: store the mean of the condition rather than individual values. The matches the spm
        calculations better.
        :return: parsed profile
        """
        ct = ConditionTissue.query.get(condition_tissue_id)

        condition_to_tissue = json.loads(ct.data)
        profile_data = json.loads(self.profile)

        output = ExpressionProfile.convert_profile(condition_to_tissue, profile_data,
                                                   use_means=use_means)

        return output

    @staticmethod
    def get_heatmap(species_id, probes, zlog=True, raw=False):
        """
        Returns a heatmap for a given species (species_id) and a list of probes. It returns a dict with 'order'
        the order of the experiments and 'heatmap' another dict with the actual data. Data is zlog transformed

        :param species_id: species id (internal database id)
        :param probes: a list of probes to include in the heatmap
        :param zlog: enable zlog transformation (otherwise normalization against highest expressed condition)
        :param raw: when True (and zlog False) return raw mean values, no normalization
        """
        profiles = ExpressionProfile.query.options(undefer('profile')).filter_by(species_id=species_id).\
            filter(ExpressionProfile.probe.in_(probes)).all()

        order = []

        output = []

        # track which requested probes had no matching profile (case-insensitive)
        not_found = [p.lower() for p in probes]

        for profile in profiles:
            name = profile.probe
            data = json.loads(profile.profile)
            order = data['order']
            experiments = data['data']

            with contextlib.suppress(ValueError):
                not_found.remove(profile.probe.lower())

            with contextlib.suppress(ValueError):
                not_found.remove(profile.sequence.name.lower())

            values = {}

            for o in order:
                values[o] = mean(experiments[o])

            row_mean = mean(values.values())
            row_max = max(values.values())

            for o in order:
                if zlog:
                    if row_mean == 0 or values[o] == 0:
                        # log2 undefined for zero values; mark as missing
                        values[o] = '-'
                    else:
                        try:
                            values[o] = log(values[o] / row_mean, 2)
                        except ValueError as _:
                            print("Unable to calculate log()", values[o], row_mean)
                            values[o] = '-'
                else:
                    if row_max != 0 and not raw:
                        values[o] = values[o] / row_max

            output.append({"name": name,
                           "values": values,
                           "sequence_id": profile.sequence_id,
                           "shortest_alias": profile.sequence.shortest_alias})

        if len(not_found) > 0:
            flash("Couldn't find profile for: %s" % ", ".join(not_found), "warning")

        return {'order': order, 'heatmap_data': output}

    @staticmethod
    def get_profiles(species_id, probes, limit=1000):
        """
        Gets the data for a set of probes (including the full profiles), a limit can be provided to avoid overly
        long queries

        :param species_id: internal id of the species
        :param probes: probe names to fetch
        :param limit: maximum number of probes to get
        :return: List of ExpressionProfile objects including the full profiles
        """
        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            filter(ExpressionProfile.probe.in_(probes)).\
            filter_by(species_id=species_id).\
            options(joinedload('sequence').load_only('name').noload('xrefs')).\
            limit(limit).all()

        return profiles

    @staticmethod
    def add_profile_from_lstrap(matrix_file, annotation_file, species_id, order_color_file=None):
        """
        Function to convert an (normalized) expression matrix (lstrap output) into a profile

        :param matrix_file: path to the expression matrix
        :param annotation_file: path to the file assigning samples to conditions
        :param species_id: internal id of the species
        :param order_color_file: tab delimited file that contains the order and color of conditions
        """
        annotation = {}

        with open(annotation_file, 'r') as fin:
            # get rid of the header
            _ = fin.readline()

            for line in fin:
                parts = line.strip().split('\t')
                if len(parts) > 1:
                    run, description = parts
                    annotation[run] = description

        order, colors = [], []
        if order_color_file is not None:
            with open(order_color_file, 'r') as fin:
                for line in fin:
                    try:
                        o, c = line.strip().split('\t')
                        order.append(o)
                        colors.append(c)
                    except Exception as _:
                        pass

        # build conversion table for sequences
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        sequence_dict = {}  # key = sequence name uppercase, value internal id
        for s in sequences:
            sequence_dict[s.name.upper()] = s.id

        with open(matrix_file) as fin:
            # read header
            _, *colnames = fin.readline().rstrip().split()

            colnames = [c.replace('.htseq', '') for c in colnames]

            # determine order when the order/color file did not provide one.
            # NOTE: fixed — `order` is initialized to [] above and is never
            # None, so the original `if order is None:` fallback was dead code.
            if not order:
                order = []

                for c in colnames:
                    if c in annotation.keys():
                        if annotation[c] not in order:
                            order.append(annotation[c])

                order.sort()

            # read each line and build profile
            new_probes = []
            for line in fin:
                transcript, *values = line.rstrip().split()
                profile = defaultdict(list)

                for c, v in zip(colnames, values):
                    if c in annotation.keys():
                        condition = annotation[c]
                        profile[condition].append(float(v))

                new_probe = {"species_id": species_id,
                             "probe": transcript,
                             "sequence_id": sequence_dict[transcript.upper()]
                             if transcript.upper() in sequence_dict.keys() else None,
                             "profile": json.dumps({"order": order,
                                                    "colors": colors,
                                                    "data": profile})
                             }

                new_probes.append(new_probe)

                if len(new_probes) > 400:
                    db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
                    new_probes = []

            # flush the remainder; skip when empty (an empty executemany would
            # insert a single all-defaults row)
            if new_probes:
                db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
class Sequence(db.Model):
    """
    ORM model for a sequence (gene/transcript) of a species, with links to its
    annotation (GO, InterPro), gene families, expression profiles, networks,
    coexpression clusters, ECC associations and cross-references.
    """
    __tablename__ = 'sequences'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    description = db.Column(db.Text)
    coding_sequence = db.deferred(db.Column(db.Text))
    type = db.Column(db.Enum('protein_coding', 'TE', 'RNA', name='sequence_type'), default='protein_coding')
    is_mitochondrial = db.Column(db.SmallInteger, default=False)
    is_chloroplast = db.Column(db.SmallInteger, default=False)

    expression_profiles = db.relationship('ExpressionProfile',
                                          backref=db.backref('sequence', lazy='joined'),
                                          lazy='dynamic',
                                          cascade="all, delete-orphan",
                                          passive_deletes=True)

    network_nodes = db.relationship('ExpressionNetwork',
                                    backref=db.backref('sequence', lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    # Other properties
    #
    # coexpression_cluster_associations declared in 'SequenceCoexpressionClusterAssociation'
    # interpro_associations declared in 'SequenceInterproAssociation'
    # go_associations declared in 'SequenceGOAssociation'
    # family_associations declared in 'SequenceFamilyAssociation'

    go_labels = db.relationship('GO', secondary=sequence_go, lazy='dynamic')
    interpro_domains = db.relationship('Interpro', secondary=sequence_interpro, lazy='dynamic')
    families = db.relationship('GeneFamily', secondary=sequence_family, lazy='dynamic')

    coexpression_clusters = db.relationship(
        'CoexpressionCluster',
        secondary=sequence_coexpression_cluster,
        backref=db.backref('sequences', lazy='dynamic'),
        lazy='dynamic')

    ecc_query_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.query_id == Sequence.id",
        backref=db.backref('query_sequence', lazy='joined'),
        lazy='dynamic')

    ecc_target_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.target_id == Sequence.id",
        backref=db.backref('target_sequence', lazy='joined'),
        lazy='dynamic')

    clade_associations_one = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin="SequenceSequenceCladeAssociation.sequence_one_id == Sequence.id",
        backref=db.backref('sequence_one', lazy='joined'),
        lazy='dynamic')

    clade_associations_two = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin="SequenceSequenceCladeAssociation.sequence_two_id == Sequence.id",
        backref=db.backref('sequence_two', lazy='joined'),
        lazy='dynamic')

    xrefs = db.relationship('XRef', secondary=sequence_xref, lazy='joined')

    def __init__(self, species_id, name, coding_sequence, type='protein_coding',
                 is_chloroplast=False, is_mitochondrial=False, description=None):
        self.species_id = species_id
        self.name = name
        self.description = description
        self.coding_sequence = coding_sequence
        self.type = type
        self.is_chloroplast = is_chloroplast
        self.is_mitochondrial = is_mitochondrial

    @property
    def protein_sequence(self):
        """
        Function to translate the coding sequence to the amino acid sequence. Will start at the first start codon and
        break after adding a stop codon (indicated by '*')

        :return: The amino acid sequence based on the coding sequence
        """
        return translate(self.coding_sequence)

    @property
    def aliases(self):
        """
        Returns a readable string with the aliases or tokens stored for this sequence in the table xrefs

        :return: human readable string with aliases or None
        """
        t = [x.name for x in self.xrefs if x.platform == 'token']

        return ", ".join(t) if len(t) > 0 else None

    @property
    def shortest_alias(self):
        """
        Returns the shortest alias

        :return: string with shortest alias or None (in case no aliases exist)
        """
        t = [x.name for x in self.xrefs if x.platform == 'token']

        return min(t, key=len) if len(t) > 0 else None

    @property
    def display_name(self):
        """
        Returns a name to display (from xrefs with display) if available otherwise return name

        :return: display name
        """
        t = [x.name for x in self.xrefs if x.platform == 'display']

        return t[0] if len(t) > 0 else self.name

    @property
    def best_name(self):
        """
        Checks if there is a display name, if not checks the shortest alias, otherwise returns name. To be used in e.g.
        graphs

        :return: string with best name to show in graphs, ...
        """
        # NOTE: `is not` (identity) is intentional here — display_name returns
        # the very same `self.name` object as its fallback, so an identity test
        # detects "a display xref exists" even when its text equals the name.
        if self.display_name is not self.name:
            return self.display_name
        elif self.shortest_alias is not None:
            return self.shortest_alias
        else:
            return self.name

    @property
    def readable_type(self):
        """
        Converts the type table to a readable string

        :return: string with readable version of the sequence type
        """
        conversion = {'protein_coding': 'protein coding',
                      'TE': 'transposable element',
                      'RNA': 'RNA'}

        if self.type in conversion.keys():
            return conversion[self.type]
        else:
            return 'other'

    @staticmethod
    def add_from_fasta(filename, species_id, compressed=False):
        """
        Bulk-adds all sequences from a FASTA file for one species.

        :param filename: path to the FASTA file
        :param species_id: internal id of the species
        :param compressed: set to True when the file is gzip-compressed
        :return: number of sequences in the input file
        """
        fasta_data = Fasta()
        fasta_data.readfile(filename, compressed=compressed)

        new_sequences = []

        # Loop over sequences, sorted by name (key here) and add to db
        for name, sequence in sorted(fasta_data.sequences.items(), key=operator.itemgetter(0)):
            new_sequence = {"species_id": species_id,
                            "name": name,
                            "description": None,
                            "coding_sequence": sequence,
                            "type": "protein_coding",
                            "is_mitochondrial": False,
                            "is_chloroplast": False}

            new_sequences.append(new_sequence)

            # add 400 sequences at the time, more can cause problems with some database engines
            if len(new_sequences) > 400:
                db.engine.execute(Sequence.__table__.insert(), new_sequences)
                new_sequences = []

        # add the last set of sequences
        db.engine.execute(Sequence.__table__.insert(), new_sequences)

        return len(fasta_data.sequences.keys())

    @staticmethod
    def add_descriptions(filename, species_id):
        """
        Reads a tab-delimited file (name<TAB>description) and stores the
        descriptions on the matching sequences of the given species.

        :param filename: path to the tab-delimited description file
        :param species_id: internal id of the species to update
        """
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        seq_dict = {}

        for s in sequences:
            seq_dict[s.name] = s

        with open(filename, "r") as f_in:
            for i, line in enumerate(f_in):
                try:
                    name, description = line.strip().split('\t')
                except ValueError:
                    print("Cannot parse line %d: \"%s\"" % (i, line), file=sys.stderr)
                else:
                    # BUGFIX: this block used to sit in a `finally` clause, which
                    # also ran after a parse error — re-using `name`/`description`
                    # from the previous line, or raising NameError when the very
                    # first line was malformed. `else` only runs on success.
                    if name in seq_dict.keys():
                        seq_dict[name].description = description

                # commit in batches to keep transactions small
                if i % 400 == 0:
                    db.session.commit()

        db.session.commit()

    @staticmethod
    def export_cds(filename):
        """
        Writes all coding sequences to a FASTA file.

        :param filename: path of the output file
        """
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.coding_sequence), file=f_out)

    @staticmethod
    def export_protein(filename):
        """
        Writes all translated (protein) sequences to a FASTA file.

        :param filename: path of the output file
        """
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.protein_sequence), file=f_out)
class Interpro(db.Model):
    """
    ORM model for an InterPro domain, with helpers to compute per-species
    statistics and to populate the table from InterPro XML / PLAZA /
    InterProScan files.
    """
    __tablename__ = 'interpro'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    description = db.Column(db.Text)
    clade_id = db.Column(db.Integer, db.ForeignKey('clades.id', ondelete='SET NULL'), index=True)

    sequences = db.relationship('Sequence', secondary=sequence_interpro, lazy='dynamic')

    # Other properties
    # sequence_associations = defined in SequenceInterproRelationship

    def __init__(self, label, description):
        self.label = label
        self.description = description

    @property
    def species_codes(self):
        """
        Finds all species the family has genes from

        :return: a list of all species (codes)
        """
        sequences = self.sequences.options(joinedload('species')).all()

        output = []

        for s in sequences:
            if s.species.code not in output:
                output.append(s.species.code)

        return output

    @property
    def species_counts(self):
        """
        Generates a phylogenetic profile of a gene family

        :return: a dict with counts per species (codes are keys)
        """
        sequences = self.sequences.options(joinedload('species')).all()

        output = {}

        for s in sequences:
            if s.species.code not in output:
                output[s.species.code] = 1
            else:
                output[s.species.code] += 1

        return output

    @staticmethod
    def sequence_stats(sequence_ids):
        """
        Takes a list of sequence IDs and returns InterPro stats for those sequences

        :param sequence_ids: list of sequence ids
        :return: dict with for each InterPro domain linked with any of the input sequences stats
        """
        data = SequenceInterproAssociation.query.filter(SequenceInterproAssociation.sequence_id.in_(sequence_ids)).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences):
        """
        Same as sequence_stats but starting from a query (joined against its
        subquery instead of an explicit id list).

        :param sequences: query yielding sequences
        :return: dict with stats per InterPro domain
        """
        subquery = sequences.subquery()

        data = SequenceInterproAssociation.query.join(subquery, SequenceInterproAssociation.sequence_id == subquery.c.id).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        # Aggregates associations into per-domain counts plus distinct
        # sequence and species lists.
        output = {}

        for d in associations:
            if d.interpro_id not in output.keys():
                output[d.interpro_id] = {
                    'domain': d.domain,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.interpro_id]['count'] += 1
                if d.sequence_id not in output[d.interpro_id]['sequences']:
                    output[d.interpro_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.interpro_id]['species']:
                    output[d.interpro_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @property
    def interpro_stats(self):
        """
        InterPro stats for all sequences carrying this domain.
        """
        # BUGFIX/perf: a full `self.sequences.all()` query used to be run here
        # only to build an unused list of ids — that dead query was removed.
        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        """GO stats for all sequences carrying this domain."""
        from conekt.models.go import GO

        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        """Gene family stats for all sequences carrying this domain."""
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    @staticmethod
    def add_from_xml(filename, empty=True):
        """
        Populates interpro table with domains and descriptions from the official website's XML file

        :param filename: path to XML file
        :param empty: If True the interpro table will be cleared before uploading the new domains, default = True
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(Interpro).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        interpro_parser = InterproParser()

        interpro_parser.readfile(filename)

        for i, domain in enumerate(interpro_parser.domains):
            interpro = Interpro(domain.label, domain.description)

            db.session.add(interpro)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_interpro_from_plaza(filename):
        """
        Adds InterPro domain annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_plaza_interpro(filename)

        gene_hash = {}
        domain_hash = {}

        all_sequences = Sequence.query.all()
        all_domains = Interpro.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for domain in all_domains:
            domain_hash[domain.label] = domain

        new_domains = []

        for gene, domains in interpro_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for domain in domains:
                    if domain["id"] in domain_hash.keys():
                        current_domain = domain_hash[domain["id"]]

                        new_domain = {"sequence_id": current_sequence.id,
                                      "interpro_id": current_domain.id,
                                      "start": domain["start"],
                                      "stop": domain["stop"]}

                        new_domains.append(new_domain)
                    else:
                        print(domain["id"], "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            # insert in batches of ~400 rows to keep statements small
            if len(new_domains) > 400:
                db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
                new_domains = []

        db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)

    @staticmethod
    def add_interpro_from_interproscan(filename, species_id):
        """
        Adds InterPro domain annotation from InterProScan output

        :param filename: Path to the annotation file
        :param species_id: internal id of the species the annotation belongs to
        :return:
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_interproscan(filename)

        gene_hash = {}
        domain_hash = {}

        all_sequences = Sequence.query.filter_by(species_id=species_id)
        all_domains = Interpro.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for domain in all_domains:
            domain_hash[domain.label] = domain

        new_domains = []

        for gene, domains in interpro_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for domain in domains:
                    if domain["id"] in domain_hash.keys():
                        current_domain = domain_hash[domain["id"]]

                        new_domain = {"sequence_id": current_sequence.id,
                                      "interpro_id": current_domain.id,
                                      "start": domain["start"],
                                      "stop": domain["stop"]}

                        new_domains.append(new_domain)
                    else:
                        print(domain["id"], "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            # insert in batches of ~400 rows to keep statements small
            if len(new_domains) > 400:
                db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
                new_domains = []

        db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
class Clade(db.Model):
    """
    ORM model for a phylogenetic clade: a named set of species (stored as a
    JSON list of species codes) with an associated Newick tree. Gene families
    and InterPro domains are assigned to the smallest clade covering all
    species they occur in.
    """
    __tablename__ = 'clades'
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    species = db.Column(db.Text(collation=SQL_COLLATION))
    species_count = db.Column(db.Integer)
    newick_tree = db.Column(db.Text)

    families = db.relationship('GeneFamily', backref='clade', lazy='dynamic')
    interpro = db.relationship('Interpro', backref='clade', lazy='dynamic')

    def __init__(self, name, species, tree):
        self.name = name
        self.species = json.dumps(species)
        self.species_count = len(species)
        self.newick_tree = tree

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @staticmethod
    def add_clade(name, species, tree):
        """
        Add a clade to the database

        :param name: name of the clade
        :param species: list with codes (!) of the species in the clade
        :param tree: newick tree for this clade. Will be stored in the database and used for visualizations
        """
        new_clade = Clade(name, species, tree)
        db.session.add(new_clade)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_clades_from_json(data):
        """
        Adds clades from a dict with clade details

        :param data: dict mapping clade names to details ('species' and 'tree')
        """
        # loop variables renamed: the original rebound `data` inside the loop,
        # shadowing the parameter that is being iterated
        for clade_name, details in data.items():
            Clade.add_clade(clade_name, details['species'], details['tree'])

    @staticmethod
    def update_clades():
        """
        Loop over all families and determine what clade they belong too. Results are stored in the database
        """
        clades = Clade.query.all()
        families = GeneFamily.query.all()

        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        for f in families:
            family_species = f.species_codes

            # skip for families without members
            if len(family_species) == 0:
                f.clade_id = None
                continue

            # find the clade with the fewest species that contains all the codes
            selected_clade, _ = get_clade(family_species, clade_to_species)

            if selected_clade is None:
                f.clade_id = None
            else:
                f.clade_id = clade_to_id[selected_clade]

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def update_clades_interpro():
        """
        Loop over all InterPro domains and determine what clade they belong too
        """
        clades = Clade.query.all()
        interpro = Interpro.query.all()

        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        for i in interpro:
            interpro_species = i.species_codes

            # skip for domains without members
            if len(interpro_species) == 0:
                i.clade_id = None
                continue

            # find the clade with the fewest species that contains all the codes
            selected_clade, _ = get_clade(interpro_species, clade_to_species)

            if selected_clade is None:
                i.clade_id = None
            else:
                i.clade_id = clade_to_id[selected_clade]

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @property
    def newick_tree_species(self):
        """
        Returns a Newick tree with the species present in the current clade.

        :return: Newick tree (string) with species for the current clade
        """
        species = {s.code: s.name for s in Species.query.all()}

        tree = newick.loads(self.newick_tree)[0]

        # replace species codes in the tree with full species names
        for code, name in species.items():
            node = tree.get_node(code)
            if node is not None:
                node.name = name

        return newick.dumps([tree])
class SequenceSequenceECCAssociation(db.Model):
    """
    ORM model linking two sequences through an Expression Context Conservation
    (ECC) score, plus helpers to build Cytoscape-compatible network dicts.
    """
    __tablename__ = 'sequence_sequence_ecc'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    query_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))
    target_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))

    ecc = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    gene_family_method_id = db.Column(
        db.Integer, db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'))
    query_network_method_id = db.Column(
        db.Integer, db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))
    target_network_method_id = db.Column(
        db.Integer, db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))

    gene_family_method = db.relationship(
        'GeneFamilyMethod', lazy='joined',
        backref=db.backref('ecc_as_family_method', lazy='dynamic', passive_deletes=True))

    query_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[query_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_query_method', lazy='dynamic', passive_deletes=True))

    target_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[target_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_target_method', lazy='dynamic', passive_deletes=True))

    @staticmethod
    def get_ecc_network(sequence, network, family):
        """
        Get network connecting a specific sequence to all genes with significant Expression Context Conservation.

        :param sequence: internal ID of sequence
        :param network: network method ID to consider
        :param family: kind of gene families used to detect ECC
        :return: network dict (can be made compatible using CytoscapeHelper)
        """
        data = SequenceSequenceECCAssociation.query.filter(
            and_(SequenceSequenceECCAssociation.query_id == sequence,
                 SequenceSequenceECCAssociation.query_network_method_id == network,
                 SequenceSequenceECCAssociation.gene_family_method_id == family)).all()

        # return an empty dict in case there are no hits for this query
        if len(data) < 1:
            return {'nodes': [], 'edges': []}

        # add the query node
        d = data[0]
        nodes = [{"id": d.query_sequence.name,
                  "name": d.query_sequence.name,
                  "species_id": d.query_sequence.species_id,
                  "species_name": d.query_sequence.species.name,
                  "gene_id": d.query_id,
                  "gene_name": d.query_sequence.name,
                  "network_method_id": network,
                  "node_type": "query"}]
        edges = []

        # group target ids by their network method to fetch inter-target edges below
        networks = {}

        for d in data:
            nodes.append({"id": d.target_sequence.name,
                          "name": d.target_sequence.name,
                          "species_id": d.target_sequence.species_id,
                          "species_name": d.target_sequence.species.name,
                          "gene_id": d.target_id,
                          "network_method_id": d.target_network_method_id,
                          "gene_name": d.target_sequence.name})

            if d.target_network_method_id not in networks.keys():
                networks[d.target_network_method_id] = []
            networks[d.target_network_method_id].append(d.target_id)

            # TODO: add p-value and corrected p once implemented
            edges.append({"source": d.query_sequence.name,
                          "target": d.target_sequence.name,
                          "ecc_score": d.ecc,
                          "edge_type": 0})

        # add ECC edges between the targets themselves (within one network method)
        for n, sequences in networks.items():
            new_data = SequenceSequenceECCAssociation.query.filter(
                and_(SequenceSequenceECCAssociation.query_id.in_(sequences),
                     SequenceSequenceECCAssociation.target_id.in_(sequences),
                     SequenceSequenceECCAssociation.target_network_method_id == n,
                     SequenceSequenceECCAssociation.query_network_method_id == n,
                     SequenceSequenceECCAssociation.gene_family_method_id == family,
                     SequenceSequenceECCAssociation.query_id != SequenceSequenceECCAssociation.target_id)).all()

            for nd in new_data:
                # TODO: add p-value and corrected p once implemented
                # make sure the connection doesn't exist already
                if not any(d['source'] == nd.target_sequence.name and
                           d['target'] == nd.query_sequence.name for d in edges):
                    edges.append({"source": nd.query_sequence.name,
                                  "target": nd.target_sequence.name,
                                  "ecc_score": nd.ecc,
                                  "edge_type": 1})

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def get_ecc_pair_network(ecc_id):
        """
        Get all data for an SequenceSequenceECCAssociation to make a ECC graph, similar to the pairwise comparisons
        in Movahedi et al.

        :param ecc_id: internal id of the SequenceSequenceECCAssociation
        :return: ecc pair with neighborhood as graph dict, plus the gene family method id
        """
        association = SequenceSequenceECCAssociation.query.get_or_404(ecc_id)

        nodes = [
            {"id": association.query_sequence.name,
             "name": association.query_sequence.name,
             "species_id": association.query_sequence.species_id,
             "species_name": association.query_sequence.species.name,
             "gene_id": association.query_id,
             "gene_name": association.query_sequence.name,
             "network_method_id": association.query_network_method_id,
             "node_type": "query"},
            {"id": association.target_sequence.name,
             "name": association.target_sequence.name,
             "species_id": association.target_sequence.species_id,
             "species_name": association.target_sequence.species.name,
             "gene_id": association.target_id,
             "gene_name": association.target_sequence.name,
             "network_method_id": association.target_network_method_id,
             "node_type": "query"},
        ]

        edges = [{"source": association.query_sequence.name,
                  "target": association.target_sequence.name,
                  "ecc_score": association.ecc,
                  'ecc_pair_color': "#D33",
                  "edge_type": "ecc"}]

        query_network = association.query_sequence.network_nodes.filter_by(
            method_id=association.query_network_method_id).first_or_404().network
        target_network = association.target_sequence.network_nodes.filter_by(
            method_id=association.target_network_method_id).first_or_404().network

        query_network_data = json.loads(query_network)
        target_network_data = json.loads(target_network)

        # track ids already added to avoid duplicate nodes
        sequences = [association.query_sequence.id, association.target_sequence.id]

        for n in query_network_data:
            gene_id = n['gene_id'] if 'gene_id' in n.keys() else None
            gene_name = n['gene_name'] if 'gene_name' in n.keys() else None

            if gene_id not in sequences:
                nodes.append({"id": gene_name,
                              "name": gene_name,
                              "species_id": association.query_sequence.species_id,
                              "species_name": association.query_sequence.species.name,
                              "gene_id": gene_id,
                              "gene_name": gene_name,
                              "network_method_id": association.query_network_method_id,
                              "node_type": "target"})
                sequences.append(gene_id)

            edges.append({"source": association.query_sequence.name,
                          "target": gene_name,
                          "link_score": n['link_score'] if 'link_score' in n else 0,
                          "edge_type": "expression",
                          'ecc_pair_color': "#3D3"})

        for n in target_network_data:
            gene_id = n['gene_id'] if 'gene_id' in n.keys() else None
            gene_name = n['gene_name'] if 'gene_name' in n.keys() else None

            if gene_id not in sequences:
                sequences.append(gene_id)
                nodes.append({"id": gene_name,
                              "name": gene_name,
                              "species_id": association.target_sequence.species_id,
                              "species_name": association.target_sequence.species.name,
                              "gene_id": gene_id,
                              "gene_name": gene_name,
                              "network_method_id": association.target_network_method_id,
                              "node_type": "target"})

            edges.append({"source": association.target_sequence.name,
                          "target": gene_name,
                          "link_score": n['link_score'] if 'link_score' in n else 0,
                          "edge_type": "expression",
                          'ecc_pair_color': "#3D3"})

        return {"nodes": nodes, "edges": edges}, association.gene_family_method_id

    @staticmethod
    def get_ecc_multi_network(gf_method_id, sequence_ids):
        """
        Creates an ECC network for multiple genes, the resulting network will contain all ECC partners of the input
        genes. Pruning this network keeping only genes with non-unique label co-occurances is recommended !

        :param gf_method_id: gene family method used to detect ECC
        :param sequence_ids: sequences to include as the core of the network
        :return: network dict, plus the gene family method id
        """
        associations = SequenceSequenceECCAssociation.query.\
            filter(SequenceSequenceECCAssociation.gene_family_method_id == gf_method_id).\
            filter(and_(SequenceSequenceECCAssociation.query_id.in_(sequence_ids),
                        SequenceSequenceECCAssociation.target_id.in_(sequence_ids))).\
            all()

        nodes, edges = [], []
        node_sequence_ids = []

        networks = []

        for a in associations:
            query_network = a.query_sequence.network_nodes.filter_by(
                method_id=a.query_network_method_id).first_or_404().network
            target_network = a.target_sequence.network_nodes.filter_by(
                method_id=a.target_network_method_id).first_or_404().network

            # BUGFIX: the original tested `query_network not in networks`, which
            # compares a JSON *string* against a list of *tuples* and is therefore
            # always True — so the same neighborhood was appended (and expanded
            # below) once per association. Deduplicate on the full tuple instead;
            # output is unchanged (downstream guards already skipped duplicates),
            # only the redundant work is removed.
            query_entry = (a.query_id, a.query_sequence.name,
                           a.query_sequence.species_id, a.query_sequence.species.name,
                           a.query_network_method_id, query_network)
            if query_entry not in networks:
                networks.append(query_entry)

            target_entry = (a.target_id, a.target_sequence.name,
                            a.target_sequence.species_id, a.target_sequence.species.name,
                            a.target_network_method_id, target_network)
            if target_entry not in networks:
                networks.append(target_entry)

            if a.query_id not in node_sequence_ids:
                node_sequence_ids.append(a.query_id)
                nodes.append({"id": a.query_sequence.name,
                              "name": a.query_sequence.name,
                              "species_id": a.query_sequence.species_id,
                              "species_name": a.query_sequence.species.name,
                              "gene_id": a.query_id,
                              "gene_name": a.query_sequence.name,
                              "network_method_id": a.query_network_method_id,
                              "node_type": "query"})

            if a.target_id not in node_sequence_ids:
                node_sequence_ids.append(a.target_id)
                nodes.append({"id": a.target_sequence.name,
                              "name": a.target_sequence.name,
                              "species_id": a.target_sequence.species_id,
                              "species_name": a.target_sequence.species.name,
                              "gene_id": a.target_id,
                              "gene_name": a.target_sequence.name,
                              "network_method_id": a.target_network_method_id,
                              "node_type": "query"})

            edges.append({"source": a.query_sequence.name,
                          "target": a.target_sequence.name,
                          "ecc_score": a.ecc,
                          'ecc_pair_color': "#D33",
                          "edge_type": "ecc"})

        new_edges = []

        for sequence_id, sequence_name, species_id, species_name, network_method_id, n in networks:
            network_data = json.loads(n)

            for node in network_data:
                gene_id = node['gene_id'] if 'gene_id' in node.keys() else None
                gene_name = node['gene_name'] if 'gene_name' in node.keys() else None

                if gene_id not in node_sequence_ids:
                    node_sequence_ids.append(gene_id)
                    nodes.append({"id": gene_name,
                                  "name": gene_name,
                                  "species_id": species_id,
                                  "species_name": species_name,
                                  "gene_id": gene_id,
                                  "gene_name": gene_name,
                                  "network_method_id": network_method_id,
                                  "node_type": "target"})

                # record the edge in both directions so the reverse pair is not added again
                if (sequence_name, gene_name) not in new_edges:
                    new_edges.append((sequence_name, gene_name))
                    new_edges.append((gene_name, sequence_name))

                    edges.append({"source": sequence_name,
                                  "target": gene_name,
                                  "link_score": node['link_score'] if 'link_score' in node else 0,
                                  "edge_type": "expression",
                                  'ecc_pair_color': "#3D3"})

        return {"nodes": nodes, "edges": edges}, gf_method_id