    @classmethod
    def setUpClass(cls):
        try:
            path = os.environ['PYOMA_DB2CHECK']
        except KeyError:
            raise unittest.SkipTest("No database specified in PYOMA_DB2CHECK")

        cls.db = pyomadb.Database(path)
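A minimal usage sketch of the same fixture outside unittest, assuming pyomadb aliases pyoma.browser.db and that a local OmaServer.h5 exists (both paths here are hypothetical):

import os
from pyoma.browser import db as pyomadb  # assumed alias for pyoma.browser.db

os.environ['PYOMA_DB2CHECK'] = '/path/to/OmaServer.h5'  # hypothetical path
database = pyomadb.Database(os.environ['PYOMA_DB2CHECK'])
entry = database.entry_by_entry_nr(1)  # fetch the first protein entry as a sanity check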
Example #2
    def _load_ogs(self):
        """
        Using the orthoxml file select only the OGs of interest that have more species than the min_species threshold
        :return: Dictionary with og name as key and list of SeqRecords
        """

        if '.fa' in self.args.dna_reference or '.fasta' in self.args.dna_reference:
            print('--- Load ogs and find their corresponding DNA seq from {} ---'.format(self.args.dna_reference))
            print(
                'Loading {} into memory. This might take a while . . . '.format(self.args.dna_reference.split("/")[-1]))
            self._db = SeqIO.index(self.args.dna_reference, "fasta")
            self._db_source = 'fa'
        elif '.h5' in self.args.dna_reference:
            print('--- Load ogs and find their corresponding DNA seq from {} ---'.format(self.args.dna_reference))
            self._db = db.Database(self.args.dna_reference)
            self._db_id_map = db.OmaIdMapper(self._db)
            self._db_source = 'h5'
        else:
            print('--- Load ogs and find their corresponding DNA seq using the REST api ---')
            self._db_source = 'REST_api'

        if self.oma.mode == 'standalone':
            self._og_orthoxml = os.path.join(self.oma_output_path, 'OrthologousGroups.orthoxml')
            self._tree_str = os.path.join(self.oma_output_path, 'EstimatedSpeciesTree.nwk')
            self._ham_analysis = pyham.Ham(self._tree_str, self._og_orthoxml, use_internal_name=False)

        ogs = {}

        orthologous_groups_aa = os.path.join(self.args.output_path, "01_ref_ogs_aa")
        os.makedirs(orthologous_groups_aa, exist_ok=True)

        orthologous_groups_dna = os.path.join(self.args.output_path, "01_ref_ogs_dna")
        os.makedirs(orthologous_groups_dna, exist_ok=True)

        names_og = self.ogs

        for name, records in tqdm(names_og.items(), desc='Loading OGs', unit=' OGs'):
            # name = file.split("/")[-1].split(".")[0]
            ogs[name] = OG()
            ogs[name].aa = self._get_aa_records(name, records)
            output_file_aa = os.path.join(orthologous_groups_aa, name + ".fa")
            output_file_dna = os.path.join(orthologous_groups_dna, name + ".fa")

            if self._db_source:
                ogs[name].dna = self._get_dna_records(ogs[name].aa, name)
                # only write DNA records when a DNA source was configured
                self._write(output_file_dna, ogs[name].dna)
            else:
                print("DNA reference was not provided. Only amino acid sequences gathered!")
            self._write(output_file_aa, ogs[name].aa)

        return ogs
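The loop above expects an OG container with .aa and .dna attributes; a minimal sketch of such a class, assuming it only needs to hold the two record lists (the real class lives elsewhere in this code base):

class OG(object):
    """Hypothetical minimal container for one orthologous group."""

    def __init__(self):
        self.aa = []   # amino-acid SeqRecords for this OG
        self.dna = []  # matching DNA SeqRecords, filled only if a DNA source exists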
Example #3

    def __init__(self,
                 go_file,
                 go_terms,
                 gaf,
                 omadb=None,
                 tarfile_ortho=None,
                 TermCountsFile=None):
        self.go_file = go_file

        if omadb:
            print('open oma db obj')
            from pyoma.browser import db
            h5_oma = open_file(omadb, mode="r")
            self.db_obj = db.Database(h5_oma)
            print('done')
        elif tarfile_ortho:
            #retrieve hog members from tarfile_ortho
            self.tar = tarfile.open(tarfile_ortho, "r:gz")
        else:
            raise Exception('please provide input dataset')

        #go_terms_hdf5 = h5py.File(go_terms, mode='r')
        #self.goterms2parents = go_terms_hdf5['goterms2parents']
        with open(go_terms, 'rb') as godf_in:
            self.godf = pickle.loads(godf_in.read())
        self.go_file = obo_parser.GODag(go_file)
        print('building gaf')
        self.gaf = goatools_utils.buildGAF(gaf)
        print('done')
        if TermCountsFile is None:
            self.termcounts = TermCounts(self.go_file, self.gaf)
        else:
            with open(TermCountsFile, 'rb') as tc_in:
                self.termcounts = pickle.loads(tc_in.read())
        #make a partial
        self.resniksimpreconf = partial(goatools_utils.resnik_sim_pandas,
                                        df=self.godf,
                                        termcounts=self.termcounts)
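A hedged sketch of how a TermCountsFile for the constructor above could be precomputed with the standard goatools API; the toy associations dict stands in for the output of goatools_utils.buildGAF:

import pickle
from goatools import obo_parser
from goatools.semantic import TermCounts

godag = obo_parser.GODag('go-basic.obo')
# toy protein -> GO-term associations; the real ones come from buildGAF(gaf)
associations = {'PROT1': {'GO:0008150'}, 'PROT2': {'GO:0003674'}}
termcounts = TermCounts(godag, associations)
with open('termcounts.pkl', 'wb') as out:
    out.write(pickle.dumps(termcounts))
# later runs can pass TermCountsFile='termcounts.pkl' to skip the GAF rebuild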
Example #4
import networkx as nx
import pandas as pd
from matplotlib import pyplot as plt
import glob
from pyoma.browser import db
import pickle
from utils import config_utils

omadir = config_utils.omadir
# bind to a new name so the pyoma `db` module is not shadowed
db_obj = db.Database(omadir + '/OmaServer.h5')

print('loading mapping')
experiments = ' fusion coexpression experiments textmining'
unidf = pd.read_csv('full_uniprot_2_string.04_2015.tsv',
                    delim_whitespace=True,
                    header=0)
unidf.columns = [col.split('|')[0].replace('#', '') for col in unidf.columns]
unidf['uniprot_code'] = unidf.uniprot_ac.map(lambda x: x.split('|')[0])
unidf['uniprot_ac'] = unidf.uniprot_ac.map(lambda x: x.split('|')[1])
omadf = pd.read_csv('oma-uniprot.txt',
                    delim_whitespace=True,
                    comment='#',
                    names=['oma', 'uniprot'])
print('done')

print('loading network files')

networks = glob.glob('./*protein.links.full*txt')
print(networks)
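One plausible next step (an assumption, not part of the original script): join the two mapping tables on the UniProt accession to get a direct STRING-to-OMA lookup. The 'string_id' column name is inferred from the header cleanup above:

mapdf = omadf.merge(unidf, left_on='uniprot', right_on='uniprot_ac', how='inner')
string2oma = dict(zip(mapdf['string_id'], mapdf['oma']))  # 'string_id' is assumed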

Example #5
    def __init__(self,
                 tarfile_ortho=None,
                 h5_oma=None,
                 taxa=None,
                 masterTree=None,
                 saving_name=None,
                 numperm=256,
                 treeweights=None,
                 taxfilter=None,
                 taxmask=None,
                 verbose=False):
        if h5_oma is not None:
            from pyoma.browser import db
            self.h5OMA = h5_oma
            self.db_obj = db.Database(h5_oma)
            self.oma_id_obj = db.OmaIdMapper(self.db_obj)

        elif tarfile_ortho:
            self.tar = tarfile_ortho
            self.h5OMA = None
            self.db_obj = None
            self.oma_id_obj = None

        self.tax_filter = taxfilter
        self.tax_mask = taxmask
        self.verbose = verbose
        self.datetime = datetime
        self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now())
        self.saving_name = saving_name

        #original_umask = os.umask(0)

        if saving_name:
            self.saving_path = config_utils.datadir + saving_name + '/'
        else:
            self.saving_path = config_utils.datadir + self.date_string + '/'
        if not os.path.isdir(self.saving_path):
            os.mkdir(path=self.saving_path)

        if masterTree is None:
            if h5_oma:
                genomes = pd.DataFrame(
                    h5_oma.root.Genome.read())["NCBITaxonId"].tolist()
                genomes = [str(g) for g in genomes]
                taxa = genomes + [131567, 2759, 2157, 45596] + [
                    taxrel[0] for taxrel in list(h5_oma.root.Taxonomy[:])
                ] + [taxrel[1] for taxrel in list(h5_oma.root.Taxonomy[:])]
                self.tree_string, self.tree_ete3 = files_utils.get_tree(
                    taxa=taxa, genomes=genomes, savename=saving_name)
            elif taxa:
                with open(taxa, 'r') as taxin:
                    taxlist = [int(line) for line in taxin]
                self.tree_string, self.tree_ete3 = files_utils.get_tree(
                    taxa=taxlist, savename=saving_name)
            else:
                raise Exception(
                    'please specify either a list of taxa or a tree')
        else:
            # masterTree points at a pickled ete3 tree; open it for reading
            with open(masterTree, 'rb') as pklin:
                self.tree_ete3 = pickle.loads(pklin.read())
                self.tree_string = self.tree_ete3.write(format=1)

        self.taxaIndex, self.reverse = files_utils.generate_taxa_index(
            self.tree_ete3, self.tax_filter, self.tax_mask)
        with open(config_utils.datadir + 'taxaIndex.pkl', 'wb') as taxout:
            taxout.write(pickle.dumps(self.taxaIndex))
        self.numperm = numperm
        if treeweights is None:
            # generate all-ones weights
            self.treeweights = hashutils.generate_treeweights(
                self.tree_ete3, self.taxaIndex, taxfilter, taxmask)
        else:
            #load machine learning weights
            self.treeweights = treeweights
        print(self.treeweights)
        wmg = WeightedMinHashGenerator(3 * len(self.taxaIndex),
                                       sample_size=numperm,
                                       seed=1)
        with open(self.saving_path + saving_name + 'wmg.pkl', 'wb') as wmgout:
            wmgout.write(pickle.dumps(wmg))

        self.wmg = wmg
        self.HAM_PIPELINE = functools.partial(
            pyhamutils.get_ham_treemap_from_row, tree=self.tree_string)
        self.HASH_PIPELINE = functools.partial(hashutils.row2hash,
                                               taxaIndex=self.taxaIndex,
                                               treeweights=self.treeweights,
                                               wmg=wmg)
        if self.h5OMA:
            self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma,
                                                db_obj=self.db_obj)
        elif self.tar:
            self.READ_ORTHO = pyhamutils.get_orthoxml
        self.hashes_path = self.saving_path + 'hashes.h5'
        self.lshpath = self.saving_path + 'newlsh.pkl'
        self.lshforestpath = self.saving_path + 'newlshforest.pkl'
        self.mat_path = self.saving_path + 'hogmat.h5'
        self.columns = len(self.taxaIndex)
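A hedged, self-contained sketch of the datasketch minhashing step configured above; the dimension is a stand-in for 3 * len(self.taxaIndex), which encodes three event types per taxon:

import numpy as np
from datasketch import WeightedMinHashGenerator

dim = 30  # stand-in for 3 * len(self.taxaIndex)
wmg = WeightedMinHashGenerator(dim, sample_size=256, seed=1)

vec = np.zeros(dim)
vec[[0, 7, 21]] = 1.0       # toy presence/loss/duplication vector for one HOG
minhash = wmg.minhash(vec)  # a WeightedMinHash, insertable into an LSH Forest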
Example #6

from tables import open_file
from Bio.SeqRecord import SeqRecord
from pyoma.browser import db
import familyanalyzer as fa

# parameters
MIN_SPECIES = 20
DUP_RATIO = 0
DIR = '/Users/daviddylus/Research/read2tree/reference_datasets/Dataset1/Output/'

# read in files
hog_XML = DIR + 'HierarchicalGroups.orthoxml'
og_XML = DIR + 'OrthologousGroups.orthoxml'
h5file = open_file("/Volumes/Untitled/OmaServer.h5", mode="r")

genomeTab = h5file.root.Genome
dbObj = db.Database(h5file)
omaIdObj = db.OmaIdMapper(dbObj)

if DUP_RATIO != 0:
    hog_op = fa.OrthoXMLParser(hog_XML)
    gene_family_xml_nodes_hog = hog_op.getToplevelGroups()
    # select all the families with more than X species and duplication ratio smaller than Y
    hog_families_X = {}
    for i, family in enumerate(gene_family_xml_nodes_hog):
        family_id = family.get('id')
        genes_per_hog = [
            val for sublist in hog_op.getGenesPerSpeciesInFam(family).values()
            for val in sublist
        ]
        species_per_hog = hog_op.getGenesPerSpeciesInFam(family).keys()
        duplication_ratio = float(len(genes_per_hog)) / float(len(species_per_hog))