Example 1
    def load_soft_series_family(self, filename):  # Load from soft data file for genes
        # SOFT files are a /sort of/ bastardized CSV with data in tab-separated columns,
        # so we use the csv reader to pull them apart, allowing for most rows being a
        # single field with slightly strange identifiers

        reader = csv.reader(open(filename, 'r', newline=''), delimiter='\t', dialect='excel')
        soft_data = self.preprocess_soft(reader)

        database = {}
        platform = {}
        samples = {}
        sample_data = {}

        for section, rows in list(soft_data.items()):

            if section.startswith('^DATABASE'):
                database = self.get_soft_metadata(rows)

            elif section.startswith('^PLATFORM'):
                platform = self.get_soft_metadata(rows)
                platform_data = self.get_soft_data(rows, '!platform_table_begin', '!platform_table_end')

            elif section.startswith('^SAMPLE'):
                key, sample_id = section.split(' = ')  # e.g. '^SAMPLE = GSM123456'
                samples[sample_id] = self.get_soft_metadata(rows)
                sample_data[sample_id] = self.get_soft_data(rows, '!sample_table_begin', '!sample_table_end')
        # We now have the entire dataseries loaded; but in a bit of a messed up format
        # Build a dataset object to fit and map the data in

        xdim = len(platform_data)  # Number of genes, from the platform table
        ydim = len(sample_data)

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))
        dso.empty(size=(ydim, xdim))

        sample_ids = sorted(samples.keys())  # Get the samples sorted so we keep everything lined up
        gene_ids = sorted(platform_data.keys())  # Get the keys sorted so we keep everything lined up

        dso.labels[0] = sample_ids
        dso.labels[1] = [platform_data[gene_id]['UNIGENE'] for gene_id in gene_ids]
        dso.entities[1] = [self.m.db.get_via_unification('UNIGENE', gene_id) for gene_id in dso.labels[1]]

        for xn, gene_id in enumerate(gene_ids):
            for yn, sample_id in enumerate(sample_ids):

                dso.data[yn, xn] = sample_data[sample_id][gene_id]['VALUE']

        return dso
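The loader above leans on self.preprocess_soft, which is not shown in this listing. Below is a minimal standalone sketch of what it plausibly does, judging by how load_soft_series_family consumes its output (sections keyed by their '^' header line, with the data rows collected under each); the name and behavior here are assumptions, not the project's actual implementation.

import csv
from collections import OrderedDict

def preprocess_soft(reader):
    # Hypothetical sketch: group SOFT rows into sections keyed by their
    # '^' entity line, e.g. '^SAMPLE = GSM123456' -> [rows...]
    sections = OrderedDict()
    current = None
    for row in reader:
        if not row:
            continue
        if row[0].startswith('^'):
            current = row[0]
            sections[current] = []
        elif current is not None:
            sections[current].append(row)
    return sections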
Example 2
    def load_datafile(self, filename):
        dso = DataSet()

        # Read data in from peakml format file
        xml = et.parse(filename)

        # Get sample ids, names and class groupings
        sets = xml.iterfind('header/sets/set')
        midclass = {}
        classes = set()
        measurements = []
        masses = {}

        for aset in sets:
            set_id = aset.find('id').text  # rename: avoid shadowing the id() builtin
            mids = aset.find('measurementids').text
            for mid in self.decode(mids):
                midclass[mid] = set_id
                measurements.append(mid)

            classes.add(set_id)

        # We have all the sample data now, parse the intensity and identity info
        peaksets = xml.iterfind('peaks/peak')
        quantities = defaultdict(dict)
        all_identities = []

        for peakset in peaksets:

            # Find metabolite identities
            annotations = peakset.iterfind('annotations/annotation')
            identities = False
            for annotation in annotations:
                if annotation.find('label').text == 'identification':
                    identities = annotation.find('value').text.split(', ')
                    all_identities.extend(identities)
                    break

            if identities:
                # PeakML supports multiple alternative metabolite identities; we don't yet,
                # so the intensity is duplicated across each identity
                # We have identities, now get intensities for the different samples
                chromatograms = peakset.iterfind('peaks/peak')  # Next level down

                for chromatogram in chromatograms:
                    mid = chromatogram.find('measurementid').text
                    intensity = float(chromatogram.find('intensity').text)
                    mass = float(chromatogram.find('mass').text)

                    # Buffer intensity and mass against each identity
                    # (they can't be placed until the full identity list is known)
                    for identity in identities:
                        quantities[mid][identity] = intensity
                        masses[identity] = mass

        # Quantities table built; class table built; now rearrange into dso
        dso.empty((len(measurements), len(all_identities)))
        dso.labels[0] = measurements
        dso.classes[0] = [midclass[mid] for mid in measurements]

        dso.labels[1] = all_identities
        db_hmdbids = self.m.db.unification['HMDB']
        dso.entities[1] = [
            db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None
            for hmdbid in all_identities
        ]
        dso.scales[1] = [float(masses[i]) for i in all_identities]

        for mid, identities in list(quantities.items()):
            for identity, intensity in list(identities.items()):
                r = measurements.index(mid)
                c = all_identities.index(identity)

                dso.data[r, c] = intensity

        dso.name = os.path.basename(filename)
        dso.description = 'Imported PeakML file'
        self.set_name(dso.name)

        return {'output': dso}
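One detail worth flagging in the fill loop above: measurements.index(mid) and all_identities.index(identity) are linear scans, so filling the table is quadratic in its size. A hypothetical refactor with the same behavior (assuming identities are unique) precomputes the positions once:

# Precompute row/column positions so each assignment is O(1)
row_of = {mid: r for r, mid in enumerate(measurements)}
col_of = {ident: c for c, ident in enumerate(all_identities)}

for mid, idents in quantities.items():
    for identity, intensity in idents.items():
        dso.data[row_of[mid], col_of[identity]] = intensity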
Example 3
    def load_soft_dataset(self, filename):  # Load from soft data file for genes
        # SOFT files are a /sort of/ bastardized CSV with data in tab-separated columns,
        # so we use the csv reader to pull them apart, allowing for most rows being a
        # single field with slightly strange identifiers
        f = open(filename, 'r', newline='')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter='\t', dialect='excel')

        soft_data = self.preprocess_soft(reader, f=f, fsize=fsize)
        # soft_data now contains lists of sections with ^ markers

        database = {}
        dataset = {}
        dataset_data = {}
        subsets = {}

        for section, rows in list(soft_data.items()):

            if section.startswith('^DATABASE'):
                database = self.get_soft_metadata(rows)

            elif section.startswith('^DATASET'):
                dataset.update(self.get_soft_metadata(rows))  # update(): a file may contain more than one ^DATASET section
                dataset_data = self.get_soft_data(rows, '!dataset_table_begin', '!dataset_table_end')

            elif section.startswith('^SUBSET'):
                key, subset_id = section.split(' = ')
                subsets[subset_id] = self.get_soft_metadata(rows)
                subsets[subset_id]['subset_sample_id'] = subsets[subset_id]['subset_sample_id'].split(',')  # Turn to list of ids

        # We now have the entire dataset loaded; but in a bit of a messed up format
        # Build a dataset object to fit and map the data in
        sample_ids = []
        for k, subset in list(subsets.items()):
            sample_ids.extend(subset['subset_sample_id'])
        sample_ids = sorted(sample_ids)  # Get the samples sorted so we keep everything lined up

        class_lookup = {}
        for class_id, s in list(subsets.items()):
            for s_id in s['subset_sample_id']:
                class_lookup[s_id] = "%s (%s)" % (s.get('subset_description', ''), class_id)

        xdim = len(dataset_data)  # Number of genes, from the dataset table
        ydim = len(sample_ids)

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))
        dso.empty(size=(ydim, xdim))

        gene_ids = sorted(dataset_data.keys())  # Get the keys sorted so we keep everything lined up

        dso.labels[0] = sample_ids
        dso.classes[0] = [class_lookup[s_id] for s_id in sample_ids]
        dso.labels[1] = [dataset_data[gene_id]['IDENTIFIER'] for gene_id in gene_ids]
        dso.entities[1] = [self.m.db.get_via_synonym(gene_id) for gene_id in dso.labels[1]]

        for xn, gene_id in enumerate(gene_ids):
            for yn, sample_id in enumerate(sample_ids):

                dso.data[yn, xn] = dataset_data[gene_id][sample_id]

        return dso
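For reference, a worked example of the class_lookup mapping built above; the subset values are made up for illustration:

subsets = {
    'GDS999_1': {'subset_description': 'control', 'subset_sample_id': ['GSM1', 'GSM2']},
    'GDS999_2': {'subset_description': 'treated', 'subset_sample_id': ['GSM3']},
}

class_lookup = {}
for class_id, s in subsets.items():
    for s_id in s['subset_sample_id']:
        class_lookup[s_id] = "%s (%s)" % (s.get('subset_description', ''), class_id)

# class_lookup == {'GSM1': 'control (GDS999_1)',
#                  'GSM2': 'control (GDS999_1)',
#                  'GSM3': 'treated (GDS999_2)'}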
Example 4
    def load_csv_C(self, filename):  # Load from csv with experiments in COLUMNS, metabolites in ROWS
        # Read in data for the graphing metabolite, with associated value (generate mean)
        f = open(filename, 'r', newline='')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=',', dialect='excel')

        hrow = next(reader)  # Top row holds the sample names
        samples = hrow[1:]

        hrow = next(reader)  # Second row holds class assignments; '.' marks an excluded column
        classesa = hrow[1:]
        classes = [c for c in classesa if c != '.']

        metabolites = []

        data = []

        for n, row in enumerate(reader):
            metabolites.append(row[0])
            for cn, c in enumerate(row[1:]):
                if classesa[cn] != '.':  # Skip excluded columns
                    try:
                        data.append(float(c))
                    except ValueError:
                        data.append(0)

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

        data = np.asarray(data)
        data = np.reshape(data, (n + 1, len(classes))).T  # (metabolites, samples) -> (samples, metabolites)

        xdim = len(metabolites)
        ydim = len(classes)

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))
        dso.empty(size=(ydim, xdim))

        # Numeric metabolite identifiers become scale positions;
        # anything else is kept as a text label
        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except ValueError:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.classes[1] = [None] * len(scales)
        dso.entities[1] = [None] * len(scales)

        dso.data = data

        return dso
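The scales/labels split above is the interesting part of this loader: a numeric row header is promoted to a scale value (plausibly a spectral position exported as a number; that reading is an assumption), while anything non-numeric stays a text label. The same logic as a standalone, runnable sketch:

def split_scales_labels(headers):
    # Numeric headers become scale positions; non-numeric stay as labels
    scales, labels = [], []
    for h in headers:
        try:
            scales.append(float(h))
            labels.append(None)
        except ValueError:
            scales.append(None)
            labels.append(h)
    return scales, labels

print(split_scales_labels(['1.234', 'Glucose']))
# -> ([1.234, None], [None, 'Glucose'])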
Example 5
    def load_csv_R(self, filename):  # Load from csv with experiments in ROWS, metabolites in COLUMNS
        # Read in data for the graphing metabolite, with associated value (generate mean)
        f = open(filename, 'r', newline='')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=',', dialect='excel')
        hrow = next(reader)  # Top row: sample, class, then metabolite names
        metabolites = hrow[2:]
        ydim = 0
        xdim = len(metabolites)

        samples = []
        classes = []
        raw_data = []


        for n, row in enumerate(reader):
            if row[1] != '.':  # row[1] = class; '.' marks an excluded sample
                ydim += 1
                samples.append(row[0])
                classes.append(row[1])
                data_row = []
                for c in row[2:]:
                    try:
                        data_row.append(float(c))
                    except ValueError:
                        data_row.append(0)

                raw_data.append(data_row)

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))
        dso.empty(size=(ydim, xdim))

        # Numeric metabolite identifiers become scale positions;
        # anything else is kept as a text label
        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except ValueError:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.entities[1] = [None] * len(scales)
        dso.classes[1] = [None] * len(scales)

        dso.data = np.array(raw_data)

        return dso
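To make the expected input concrete, here is a made-up file in the layout load_csv_R parses: one sample per row, the class in the second column ('.' marks an excluded sample), and metabolite values from the third column on:

import csv
import io

example = io.StringIO(
    "Sample,Class,Glucose,Lactate\n"
    "S1,A,1.2,0.4\n"
    "S2,.,9.9,9.9\n"   # class '.' -> load_csv_R would skip this row
    "S3,B,0.8,1.1\n"
)
for row in csv.reader(example):
    print(row)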
Example 6
    def load_datafile(self, filename):
        dso = DataSet()

        # Read data in from peakml format file
        xml = et.parse(filename)

        # Get sample ids, names and class groupings
        sets = xml.iterfind('header/sets/set')
        midclass = {}
        classes = set()
        measurements = []
        masses = {}

        for aset in sets:
            set_id = aset.find('id').text  # rename: avoid shadowing the id() builtin
            mids = aset.find('measurementids').text
            for mid in self.decode(mids):
                midclass[mid] = set_id
                measurements.append(mid)

            classes.add(set_id)

        # We have all the sample data now, parse the intensity and identity info
        peaksets = xml.iterfind('peaks/peak')
        quantities = defaultdict(dict)
        all_identities = []

        for peakset in peaksets:

            # Find metabolite identities
            annotations = peakset.iterfind('annotations/annotation')
            identities = False
            for annotation in annotations:
                if annotation.find('label').text == 'identification':
                    identities = annotation.find('value').text.split(', ')
                    all_identities.extend(identities)
                    break

            if identities:
                # PeakML supports multiple alternative metabolite identities; we don't yet,
                # so the intensity is duplicated across each identity
                # We have identities, now get intensities for the different samples
                chromatograms = peakset.iterfind('peaks/peak')  # Next level down

                for chromatogram in chromatograms:
                    mid = chromatogram.find('measurementid').text
                    intensity = float(chromatogram.find('intensity').text)
                    mass = float(chromatogram.find('mass').text)

                    # Buffer intensity and mass against each identity
                    # (they can't be placed until the full identity list is known)
                    for identity in identities:
                        quantities[mid][identity] = intensity
                        masses[identity] = mass

        # Quantities table built; class table built; now rearrange into dso
        dso.empty((len(measurements), len(all_identities)))
        dso.labels[0] = measurements
        dso.classes[0] = [midclass[mid] for mid in measurements]

        dso.labels[1] = all_identities
        db_hmdbids = self.m.db.unification['HMDB']
        dso.entities[1] = [db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None for hmdbid in all_identities]
        dso.scales[1] = [float(masses[i]) for i in all_identities]

        for mid, identities in list(quantities.items()):
            for identity, intensity in list(identities.items()):
                r = measurements.index(mid)
                c = all_identities.index(identity)

                dso.data[r, c] = intensity

        dso.name = os.path.basename(filename)
        dso.description = 'Imported PeakML file'
        self.change_name.emit(dso.name)

        return {'output': dso}