Ejemplo n.º 1
0
    def export( self ):
        """Write every transcript sequence to ``transcripts.fasta``.

        All transcript IDs are read from the database, each sequence is
        fetched with ``Transcript( id ).get_sequence()`` and written in FASTA
        format to ``settings.data_folder + "/transcripts.fasta"``. Transcripts
        for which no sequence is returned are reported on stdout and skipped.
        A progress line is printed after every 100 written sequences.
        """
        print( "Exporting..." )

        # Get all transcript IDs
        results = db_session.query( Transcript.id ).all()

        target_filepath = settings.data_folder + "/transcripts.fasta"
        n = 0

        # The context manager closes (and flushes) the file even when a
        # sequence lookup or a write raises half-way through the export.
        with open( target_filepath, "w" ) as output_handle:
            for result in results:
                transcript_id = result[ 0 ]
                seq_record = Transcript( transcript_id ).get_sequence()

                if seq_record is None:
                    # format() instead of "+" so a non-string ID cannot raise
                    print( "Missing sequence for [{}]".format( transcript_id ) )
                    continue

                seq_record.id = transcript_id
                SeqIO.write( seq_record, output_handle, "fasta" )

                n += 1
                if n % 100 == 0:
                    print( "[{}] sequences written".format( n ) )

        print( "...Saved transcripts to [{}]".format( target_filepath ) )
Ejemplo n.º 2
0
def insert_db(urls):
    """Download each URL and stage one ``Transcript`` per response.

    Every response body is parsed as JSON; the object stored under the
    ``"anforande"`` key supplies the transcript fields. The new rows are
    added to ``db.session`` only -- this function does not commit.
    """
    # (Transcript attribute, key in the "anforande" object), in assignment order
    field_map = (
        ("transcript_id", "anforande_id"),
        ("transcript", "anforandetext"),
        ("speaker_id", "intressent_id"),
        ("speaker_title", "talare"),
        ("party", "parti"),
        ("section", "avsnittsrubrik"),
        ("date", "dok_datum"),
    )

    for url in urls:
        response = requests.get(url)
        speech = json.loads(response.content)['anforande']

        record = Transcript()
        for attr_name, json_key in field_map:
            # .get() leaves the attribute as None when the key is absent
            setattr(record, attr_name, speech.get(json_key))
        db.session.add(record)
Ejemplo n.º 3
0
def create_student(
        school_id):  # Students must be registered through their school
    """Register a new student for ``school_id`` (POST) or show the form (GET).

    On POST the form fields ``first_name``, ``last_name``, ``email`` and
    ``password`` are read. If the e-mail address is malformed, or a student
    with that address already exists, nothing is created. Otherwise a
    ``Student`` and its ``Transcript`` are stored. Every POST ends with a
    redirect to the home page. On GET the registration template is rendered.

    Args:
        school_id: identifier of the school; converted with ``int()`` before
            it is passed to the models.
    """
    if request.method == "POST":
        first_name = request.form["first_name"]
        last_name = request.form["last_name"]
        email = request.form["email"]
        password = request.form["password"]
        # TODO: read "confirm_password" and reject the form when it differs.

        # Minimal shape check: something before the "@" and a "." after it.
        # find()/rfind() return -1 instead of raising like index() did, and
        # the two conditions are OR-ed so failing either one rejects the
        # address (the previous AND only rejected when both failed).
        at_pos = email.find("@")
        if at_pos <= 0 or email.rfind(".") <= at_pos:
            return redirect(url_for('.home_page'))

        already_registered = Student.query.filter_by(email=email).count()
        if already_registered == 0:
            new_student = Student(first_name, last_name, email, password,
                                  int(school_id))
            db_session.add(new_student)
            # Commit first so that new_student.id is populated before the
            # transcript that references it is created.
            db_session.commit()
            transcript = Transcript(new_student.id, int(school_id))
            db_session.add(transcript)
            db_session.commit()
            log('Student Created')
        return redirect(url_for('.home_page'))
    if request.method == "GET":
        return render_template('student_register.html')
Ejemplo n.º 4
0
    def process_transcript_id(self, transcript_id):
        """Align all sequences of one transcript with ClustalW and store the result.

        The sequences returned by ``Transcript(transcript_id).get_sequences()``
        are written to ``tmp.fa`` in ``settings.temp_folder``, aligned by
        running ``clustalw2`` on that file, and every row of the resulting
        ``tmp.aln`` is saved as an ``AlignmentEntry``. Nothing is done (apart
        from a warning on stdout) when there are fewer than two sequences.

        Args:
            transcript_id: ID of the transcript; concatenated into the
                progress message, so it must be a string.
        """
        print("Aligning ["+transcript_id+"]...")
        sys.stdout.flush()

        seqs_to_align = list(Transcript(transcript_id).get_sequences().values())

        if len(seqs_to_align) <= 1:
            print("Warning - not enough sequences to proceed with alignment")
            return

        temp_filepath = settings.temp_folder+"/tmp.fa"

        # output to a fasta file for clustalw alignment; the context manager
        # makes sure the file is flushed and closed before clustalw reads it,
        # even if writing raises
        with open(temp_filepath, "w") as output_handle:
            SeqIO.write(seqs_to_align, output_handle, "fasta")

        # run the clustalw alignment (its captured output is not needed)
        clustalw_cline = ClustalwCommandline("clustalw2", infile=temp_filepath, quicktree=True)
        clustalw_cline()

        # parse the results into the database
        entries = AlignIO.read(settings.temp_folder+"/tmp.aln", "clustal")
        for entry in entries:
            obj = AlignmentEntry(transcript_id, entry.id, str(entry.seq))
            db_session.add(obj)

        db_session.commit()

        print("Aligned")
Ejemplo n.º 5
0
def preprocess_all_transcripts():
    """Preprocess every transcript file that is not in the database yet.

    Each directory under ``TRANSCRIPTS_DIR_PATH`` is treated as one term (the
    last four characters of its path). Files whose name already belongs to a
    stored ``Transcript`` are left alone; all others are handed to
    ``__preprocess_transcript``. Progress is logged when ``VERBOSE`` is set.
    """
    for term_dir in __list_dir(TRANSCRIPTS_DIR_PATH):
        term = term_dir[-4:]

        if VERBOSE:
            logging.info("Preprocessing term %s ..." % term)

        transcript_count = 0
        for path in __list_dir(term_dir):
            # Every file is counted, whether or not it needs preprocessing.
            transcript_count += 1

            file_name = os.path.basename(path)
            existing = Transcript.get_or_none(
                Transcript.file_name == file_name)
            if existing is not None:
                continue

            if VERBOSE:
                logging.info("Preprocessing document %s/%s ..." %
                             (term, file_name))

            transcript = __preprocess_transcript(path)

            if VERBOSE:
                petitioner_total = len(transcript.petitioner_statements())
                respondent_total = len(transcript.respondent_statements())
                logging.info(
                    "Done preprocessing document %s/%s. Parsed %s petitioner statements and %s repondent statements."
                    % (term, file_name, petitioner_total, respondent_total))

        if VERBOSE:
            logging.info("Done preprocessing term %s. Parsed %s transcripts." %
                         (term, transcript_count))
Ejemplo n.º 6
0
def print_coverage_stats():
    """Log how many cases have a transcript and how many transcripts have cases.

    Two log lines are written: the share of ``Case`` rows whose ``transcript``
    is set, and the share of ``Transcript`` rows with at least one related
    case. An empty table is reported as 0% coverage instead of raising
    ``ZeroDivisionError``.
    """

    def coverage_percent(part, total):
        # Percentage rounded to two decimals; nothing to cover means 0%.
        if not total:
            return 0.0
        return round(float(part) / float(total) * 100.0, 2)

    total_count = Case.select().count()
    has_transcript_count = Case.select().where(
        Case.transcript.is_null(False)).count()
    logging.info(
        "There are %s cases and %s cases with transcripts, for a coverage of %s%s."
        % (total_count, has_transcript_count,
           coverage_percent(has_transcript_count, total_count), "%"))

    total_count = Transcript.select().count()
    # NOTE(review): this issues one count query per transcript; a join or
    # subquery would be cheaper if the table grows.
    has_case_count = sum(1 for transcript in Transcript.select()
                         if transcript.cases.count() > 0)
    logging.info(
        "There are %s transcripts and %s transcripts with cases, for a coverage of %s%s."
        % (total_count, has_case_count,
           coverage_percent(has_case_count, total_count), "%"))
Ejemplo n.º 7
0
def __preprocess_transcript(file_path):
    """Parse one transcript PDF and store it together with its statements.

    The term is the (integer) name of the directory containing the file and
    the docket number is extracted from the file name. The statements, raw
    text and red flags come from ``__extract_statements``; each statement is
    linked to the transcript, marked as petitioner or respondent, persisted
    via ``get_or_create`` and then given its paragraphs.

    Args:
        file_path: path of the transcript file; its parent directory name
            must be convertible with ``int()``.

    Returns:
        The transcript object returned by ``Transcript(...).get_or_create()``.
    """
    dir_path, file_name = os.path.split(file_path)
    term = int(os.path.basename(dir_path))

    docket = None
    # Raw string: "\d", "\-", "\." etc. are invalid escapes in a plain string
    # literal and trigger warnings on current Python versions.
    FILE_PATH_REGEX = r"(\d\d\-\d+)(?:_[^\.]+)?(?:\[Reargued\])?\.pdf"
    matches = re.findall(FILE_PATH_REGEX, file_name)
    if not matches:
        logging.info("Regex didn't match file name: %s." % file_name)
    else:
        # The first match wins; more than one is unexpected, so report it.
        docket = matches[0]
        if len(matches) > 1:
            logging.info("Regex matched file name more than once: %s." %
                         file_name)

    petitioner_statements, respondent_statements, raw_text, red_flags = __extract_statements(
        file_path)
    transcript = Transcript(raw_text=raw_text,
                            term=term,
                            docket=docket,
                            file_name=file_name)
    transcript = transcript.get_or_create()

    def store_statements(statements, role_attribute):
        # Persist each statement for this transcript with the given role flag.
        for statement in statements:
            # Copy the paragraphs first: get_or_create() may hand back a
            # different object than the one carrying temp_paragraphs.
            paragraphs = statement.temp_paragraphs[:]
            statement.transcript = transcript
            setattr(statement, role_attribute, True)
            statement = statement.get_or_create()
            for paragraph in paragraphs:
                statement.add_paragraph(paragraph)

    store_statements(petitioner_statements, "speaker_is_petitioner")
    store_statements(respondent_statements, "speaker_is_respondent")

    for gloss in red_flags:
        transcript.add_red_flag(gloss)

    return transcript
Ejemplo n.º 8
0
	def export(self):
		"""Write every transcript sequence to ``transcripts.fasta``.

		All transcript IDs are read from the database, each sequence is
		fetched with ``Transcript(id).get_sequence()`` and written in FASTA
		format to ``settings.data_folder + "/transcripts.fasta"``. Transcripts
		without a sequence are reported on stdout and skipped; a progress
		line is printed after every 100 written sequences.
		"""
		print("Exporting...")

		from Bio import SeqIO
		from database import db_session
		from models import Transcript
		import settings

		# Get all transcript IDs
		results = db_session \
			.query(Transcript.id) \
			.all()

		n = 0

		target_filepath = settings.data_folder+"/transcripts.fasta"

		# The context manager closes the file even if a lookup or write raises.
		with open(target_filepath, "w") as output_handle:
			for result in results:
				transcript_id = result[0]
				seq_record = Transcript(transcript_id).get_sequence()

				# Identity test: "== None" would invoke the record's own
				# comparison operator (presumably a Biopython SeqRecord,
				# which does not support equality comparison -- TODO confirm).
				if seq_record is None:
					print("Missing sequence for ["+str(transcript_id)+"]")
					continue

				seq_record.id = transcript_id
				SeqIO.write(seq_record, output_handle, "fasta")

				n += 1
				if n % 100 == 0:
					print("["+str(n)+"] sequences written")

		print("...Saved transcripts to ["+target_filepath+"]")
Ejemplo n.º 9
0
def reconciliate_cases_and_transcripts():
    """Attach to every case the transcript that has the same docket.

    Cases and transcripts are both indexed by ``preprocess_docket(docket)``.
    Records whose docket cannot be preprocessed are skipped (best effort).
    Each case whose key also appears among the transcripts gets that
    transcript assigned and is saved. When several records share a key, the
    last one iterated wins.
    """

    def index_by_docket(records):
        # Map preprocessed docket -> record, ignoring unparsable dockets.
        indexed = {}
        for record in records:
            try:
                indexed[preprocess_docket(record.docket)] = record
            except Exception:
                # Deliberately best-effort, but no longer a bare "except:",
                # which also swallowed KeyboardInterrupt and SystemExit.
                continue
        return indexed

    case_dockets = index_by_docket(Case.select())
    transcript_dockets = index_by_docket(Transcript.select())

    for docket, case in case_dockets.items():
        if docket in transcript_dockets:
            case.transcript = transcript_dockets[docket]
            case.save()
Ejemplo n.º 10
0
    def get_normalised(self):
        """Return the normalised measurements of this transcript as TSV text.

        One line is produced per nucleotide of the transcript sequence:
        1-based position, nucleotide and measurement, separated by tabs.
        Positions without a stored value (``None`` or beyond the end of the
        unpacked values) are written as ``NA``.

        Returns:
            str: the tab-separated table, one ``\\n``-terminated row per
            nucleotide.

        Raises:
            IndexError: if no measurement set exists for this run/transcript.
        """
        # Grab sequence string
        seq_str = Transcript(self.transcript_id).get_sequence_str()

        # Use the ORM to grab all the normalised stuff
        results = db_session \
            .query(NucleotideMeasurementSet) \
            .filter(
                NucleotideMeasurementSet.nucleotide_measurement_run_id==self.nucleotide_measurement_run_id,
                NucleotideMeasurementSet.transcript_id==self.transcript_id
            ) \
            .all()

        measurement_set = results[0]
        # TODO detect whether float or int and use the correct unpacker.
        # Needed for raw count values download option
        unpacked = values_str_unpack_float(measurement_set.values)
        n_values = len(unpacked)

        # build the output rows and join once instead of growing a string
        rows = []
        for index, nucleotide in enumerate(seq_str):
            value = unpacked[index] if index < n_values else None
            measurement = "NA" if value is None else value
            rows.append(
                str(index + 1) + "\t" + nucleotide + "\t" + str(measurement) + "\n")

        return "".join(rows)
Ejemplo n.º 11
0
def create_signatures(request):
    """Generates signatures from profiles."""
    # Sort profiles according to tissues
    # compare DR vs. AL.
    profiles = Profile.objects.all()
    #print len(profiles)
    signatures = {}
    for profile in profiles:
        tissues = ' '.join([tissue.name for tissue in profile.tissue.all()])
        print tissues, profile.diet.shortcut
        if tissues not in signatures:
            signatures[tissues] = [None, None]
        if profile.diet.shortcut == 'DR':
            signatures[tissues][0]= profile
        else:
            signatures[tissues][1] = profile
    print signatures

    for tissues, profiles in signatures.items():
        print tissues, profiles
        signature = Signature(name=tissues, species=profiles[0].species, diet=profiles[0].diet)
        signature.save()
        for tissue in profiles[0].tissue.all():
            signature.tissues.add(tissue)
        for profile in profiles:
            #background = []
            profile.transcripts = {}
            probes = Probe.objects.filter(profile=profile)
            for probe in probes:
                if not probe.name.startswith('RANDOM'):
                    transcript_name = probe.name.split('P')[0]
                    if transcript_name not in profile.transcripts:
                        profile.transcripts[transcript_name] = [probe.expression]
                    else:
                        profile.transcripts[transcript_name].append(probe.expression)
                #else: # For background subtraction.
                    #background.append(probe.expression)

        for transcript_name, exp_expression in profiles[0].transcripts.items():
            # If expression too low of e.g. 1/3 of probes, exclude probe.
            # RMA (background subtraction, quantile normalization, and median polishing)
            # Benjamini p-value

            exp = sum(exp_expression)/len(exp_expression)
            ctr_expression = profiles[1].transcripts[transcript_name]
            ctr = sum(ctr_expression)/len(ctr_expression)
            ratio = exp/ctr
            if ratio < 1: fold_change = -(1/ratio)
            else: fold_change = ratio
            if len(exp_expression) == 1 or len(ctr_expression) == 1:
                es = pvalue = None
            else:
                es = effect_size(exp_expression, ctr_expression)
                pvalue = t_two_sample(exp_expression, ctr_expression)[1] # Calculate p-value.

            transcript = Transcript(seq_id=transcript_name,
                                    ratio=ratio,
                                    fold_change=fold_change,
                                    effect_size=es,
                                    pvalue=pvalue)
            transcript.save()
            expression = Expression.objects.create(signature=signature, transcript=transcript,
                                           exp=exp, ctr=ctr, ratio=ratio, fold_change=fold_change,
                                            effect_size=es, pvalue=pvalue)
    print('Done')
    return redirect('/expressions/signatures/')
Ejemplo n.º 12
0
    def get_raw(self):
        """Return the raw counts of this transcript as tab-separated text.

        The first line is a header; each following line holds the 1-based
        position, the nucleotide, the summed minus and plus counts and then
        one column per replicate lane (ordered by minus/plus, biological
        replicate, technical replicate).
        """
        seq_str = Transcript(self.transcript_id).get_sequence_str()

        # Compiled (summed) counts for this run / transcript
        compiled_rows = db_session.query(RawReactivities).filter(
            RawReactivities.nucleotide_measurement_run_id==self.nucleotide_measurement_run_id,
            RawReactivities.transcript_id==self.transcript_id
        ).all()
        compiled = compiled_rows[0]

        cols = [
            values_str_unpack_int(compiled.minus_values),
            values_str_unpack_int(compiled.plus_values),
        ]

        # Raw per-lane replicate counts
        lanes = db_session.query(RawReplicateCounts).filter(
            RawReplicateCounts.nucleotide_measurement_run_id==self.nucleotide_measurement_run_id,
            RawReplicateCounts.transcript_id==self.transcript_id
        ).order_by(
            RawReplicateCounts.minusplus_id,
            RawReplicateCounts.bio_replicate_id,
            RawReplicateCounts.tech_replicate_id
        ).all()

        cols.extend(values_str_unpack_int(lane.values) for lane in lanes)
        tech_rep_ids = set(lane.tech_replicate_id for lane in lanes)

        # The technical replicate suffix is left out only when exactly one
        # distinct technical replicate id occurs.
        with_tech_suffix = len(tech_rep_ids) != 1
        headers = []
        for lane in lanes:
            label = str(lane.minusplus_id) + "_B" + str(lane.bio_replicate_id)
            if with_tech_suffix:
                label += "_T" + str(lane.tech_replicate_id)
            headers.append(label)

        # Assemble header and data rows, then join once
        chunks = [
            "position\tsequence\tsum_minus\tsum_plus\t" + "\t".join(headers) + "\n"
        ]
        for index in range(len(cols[0])):
            fields = [str(index + 1), seq_str[index]]
            for col in cols:  # the dynamic columns
                fields.append(str(int(col[index])))
            chunks.append("\t".join(fields) + "\n")
        return "".join(chunks)
Ejemplo n.º 13
0
    def build_entries(self, experiment_ids):
        """Collect per-nucleotide measurements for the given experiments.

        For every experiment a record with its id, description and one entry
        per nucleotide of the transcript sequence is built and filled with
        the stored measurements. Experiments in which every measurement is 0
        or missing lose their ``data`` list and are flagged ``empty``.

        Sets ``self.empty`` (True when all experiments are empty) and
        ``self.data_json`` (the JSON dump of the collected records).
        """
        from models import NucleotideMeasurementRun

        # Load experiments
        run_query = db_session.query(NucleotideMeasurementRun)
        experiments = run_query.filter(
            NucleotideMeasurementRun.id.in_(experiment_ids)).all()

        # Load sequence and measurements
        seq_record = Transcript(self.transcript_id).get_sequence(self.strain_id)
        seq_str = str(seq_record.seq)
        measurements_data = db_session.query(NucleotideMeasurementSet).filter(
            NucleotideMeasurementSet.nucleotide_measurement_run_id.in_(experiment_ids),
            NucleotideMeasurementSet.transcript_id==self.transcript_id
        ).all()

        # One record per experiment, with a blank slot for every nucleotide
        data = {}
        for experiment in experiments:
            data[experiment.id] = {
                "id": experiment.id,
                "description": experiment.description,
                "data": [
                    {"position": position, "nuc": nucleotide, "measurement": None}
                    for position, nucleotide in enumerate(seq_str)
                ],
            }

        # Fill the slots; there may be more than one measurement set
        for measurement_set in measurements_data:
            run_id = measurement_set.nucleotide_measurement_run_id
            unpacked = values_str_unpack_float(measurement_set.values)
            for pos, measurement in enumerate(unpacked):
                data[run_id]["data"][pos]["measurement"] = measurement

        # Flag experiments without any usable measurement
        self.empty = True  # stays True only if every experiment is empty
        for entry in data.values():
            has_signal = any(
                slot["measurement"] not in (0, None) for slot in entry["data"])
            if has_signal:
                self.empty = False
                entry["empty"] = False
            else:
                del entry["data"]
                entry["empty"] = True

        self.data_json = json.dumps(data)
Ejemplo n.º 14
0
    def execute_gene(self, feature_rows, strain_id):
        """Queue the Gene, Transcript and Feature objects described by one gene's rows.

        Each row is indexed positionally: [0] chromosome id, [2] feature type,
        [3] start, [4] end, [6] strand ("+" means forward) and [8] the
        attribute string. Rows are handled in order and the current gene id
        is carried from one row to the next:

        * a row with an "ID=Gene" attribute starts a gene (only when its type
          is "gene"; otherwise the following rows are ignored until the next
          gene row),
        * a row with an "ID=Transcript" attribute queues a Transcript,
        * a row with a "Parent=Transcript" attribute queues a Feature.

        New objects are appended to self.genes_to_write,
        self.transcripts_to_write and self.features_to_write; genes and
        transcripts are also remembered in self.genes_seen /
        self.transcripts_seen so they are only queued once.

        The method returns early (None) as soon as a gene row is found that
        is not in self.filter_genes, when such a filter is set.
        """
        # NOTE(review): features, sequence, min_start and max_end are never
        # read in this method; transcript is only assigned further down.
        features = {}
        sequence = None
        transcript = None

        # ID of the gene currently being processed; None means "skip rows".
        gene_id = None
        min_start = None
        max_end = None

        for feature_row in feature_rows: # Loop through annotation rows in the gff file, all related to the current gene

            # keep track of start and end
            start = feature_row[3]
            end = feature_row[4]
            direction = "forward" if feature_row[6] == "+" else "reverse"
            chromosome_id = feature_row[0]

            feature_type = feature_row[2]
            attribs = feature_row[8].strip()

            # This causes bugs.
            # if feature_type == "gene": # Handle gene entries
                # gene_id = attribs.split(";")[0].split(":")[1] # grab the gene ID - we'll want this for later

            # presumably returns the value following "ID=Gene" in the
            # attribute string, or None when absent - verify against the helper
            new_gene_id = self.find_attribs_value("ID=Gene", attribs)
            if new_gene_id != None:

                # only deal with proper genes. setting gene_id to None means nothing else will be processed.
                # so it will essentially skip non-"gene" entries.
                if feature_type != "gene":
                    gene_id = None
                    continue

                # Check against filter list if there is one
                if self.filter_genes != None and new_gene_id not in self.filter_genes:
                    # filter list exists, and gene is not in filter list
                    # skip this gene (this abandons all remaining rows too)
                    return

                gene_id = new_gene_id

                # add the Gene entry - if it hasn't been already
                if gene_id not in self.genes_seen: 
                    gene = Gene(gene_id)
                    self.genes_to_write.append(gene)
                    self.genes_seen[gene_id] = gene
            
            elif gene_id != None : # Handle transcript entries - if the gene is legit
                transcript_id = self.find_attribs_value("ID=Transcript", attribs)
                if transcript_id != None: # it's a transcript entry

                    # add the Transcript entry - if it hasn't been already
                    transcript_id = self.ensure_unique_transcript_id(transcript_id)

                    if transcript_id not in self.transcripts_seen: 
                        transcript = Transcript(
                            id=transcript_id, gene_id=gene_id
                        )
                        self.transcripts_to_write.append(transcript)
                        self.transcripts_seen[transcript.id] = transcript

                else: # Handle transcript feature entries

                    # for some reason, features for a given strain/transcript 
                    # combination are not always added

                    # NOTE(review): unlike the transcript branch above, this ID
                    # is not passed through ensure_unique_transcript_id -
                    # confirm the two always agree.
                    transcript_id = self.find_attribs_value("Parent=Transcript", attribs)

                    if transcript_id != None: # it's a transcript feature entry
                        # put a filter here? some elements are not worth storing?
                        self.features_to_write.append(Feature(
                            transcript_id=transcript_id,
                            type_id=feature_row[2],
                            strain_id=strain_id,
                            chromosome_id=chromosome_id,
                            start=start,
                            end=end,
                            direction=direction
                        ))

                    else:
                        pass # this happens for pseudogenes and TEs - which we are not interested in
Ejemplo n.º 15
0
def add_signature(request):
    """The aim is to retrieve a list of differential expressed genes for certain
    criteria (e.g. fold_change, p-value, tissue).
    """
    form = SignatureForm(request.POST or None, request.FILES or None)
    if request.POST:
        if not "file" in request.POST:
            file = request.FILES['file']
            file.name = file.name.replace('.txt', '')
            data = file.read().replace('\r', '').split('\n')
        elif "profile" not in request:
            msg = "No file or profiles selected. Please provide either a signature "\
                  "file to upload or select profiles to derive a signature."
            messages.add_message(request, messages.ERROR, ugettext(msg))
            return redirect('/expressions/signature/add/')

        # Inferre descriptive informations from the filename:
        if file.name.startswith('name='):
            info = dict([item.split('=') for item in file.name.split(';')])
        if 'tissue' in info:
            tissues = info['tissue'].replace('-', '@').replace(
                ', ', '@').replace(' and ', '@').split(
                    '@')  # @ is unlikely to be used as filename.
        else:
            tissues = request.POST.getlist('tissues')
        if 'diet' in request.POST and request.POST['diet']:
            regimen = Regimen.objects.get(pk=request.POST['diet'])
        elif "diet" in info:
            regimen = Regimen.objects.get(shortcut__exact=info['diet'])

        # Species from form:
        try:
            species = Species.objects.get(pk=request.POST['species'])
        except ValueError as e:
            msg = "Species not found in Denigma db. %s. Please select a species." % e
            messages.add_message(request, messages.ERROR, ugettext(msg))
            return redirect('/expressions/signature/add/')

        # Create signature:
        signature = Signature(name=request.POST['name'] or info['name'],
                              diet=regimen,
                              species=species)  #,
        signature.save()

        # Adding tissues:
        for tissue in tissues:
            try:
                tissue = Tissue.objects.get(
                    pk=tissue)  #if it is selected from form
            except:
                print "Did not found tissue by pk."
                try:
                    tissue = Tissue.objects.get(
                        name__iexact=tissue
                    )  # If it is inferred from file name.
                except Tissue.DoesNotExist as e:
                    messages.add_message(
                        request, messages.ERROR,
                        ugettext("%s: %s" % (str(e)[:-1], tissue)))
                    return redirect('/expressions/signature/add/')

            signature.tissues.add(tissue)
        print "Tissues:", signature.tissues.all()

        header = {}
        for index, column in enumerate(data[0].split('\t')):
            if "DR" in column: column = "exp"
            elif "AL" in column: column = "ctr"

            header[column.lower().replace('gene symbol', 'symbol')\
                                 .replace('gene_symbol', 'symbol')\
                                 .replace(' ', '_')\
                                 .replace('platform_cloneid', 'seq_id')\
                                 .replace('ensembl_gene', 'seq_id')] = index # WTF is this?

        #num_lines = len(data); counter = 0
        print len(data[1:])
        for line in data[1:]:
            #print(line)
            #print(header)
            try:
                #print("Trying")
                # For effect size
                ctr_values = []
                exp_values = []

                #counter += 1
                if not line: continue
                columns = line.split('\t')
                if len(columns) < len(header): continue  #break #
                seq_id = columns[header['seq_id']]
                symbol = columns[header['symbol']]
                if symbol == "None": symbol = None
                ctr = float(columns[header['ctr']])
                exp = float(columns[header['exp']])
                if "ratio" in header:
                    ratio = float(columns[header['ratio']])
                    if ratio < 1:
                        fold_change = -(1 / ratio)
                    else:
                        fold_change = ratio
                else:
                    ratio = float(
                        columns[header['fold_change']])  # 2**exp/2**ctr
                if ratio < 1:
                    fold_change = -(1 / ratio)
                else:
                    fold_change = ratio
                # Calculating effect size:
                for k, v in header.items():
                    if k.startswith('ctr') and k != 'ctr':
                        ctr_values.append(float(columns[v]))
                    elif k.startswith('exp') and k != 'exp':
                        exp_values.append(float(columns[v]))


#                if exp_values and exp_values != ctr_values:
#                    #print exp_values
#                    es = effect_size(exp_values, ctr_values)
#                else:
                es = None
                #                if 'pvalue' in header:
                #                    pvalue = columns[header['p_value']]
                #                else:
                if exp_values != ctr_values:
                    pvalue = t_two_sample(ctr_values, exp_values)[1]
                else:
                    pvalue = 1

                transcript = Transcript(seq_id=seq_id,
                                        symbol=symbol,
                                        ratio=ratio,
                                        fold_change=fold_change,
                                        pvalue=pvalue,
                                        effect_size=es)

                transcript.save()
                #print(transcript.id, transcript.symbol, transcript.ratio)
                expression = Expression.objects.create(signature=signature,
                                                       transcript=transcript,
                                                       exp=exp,
                                                       ctr=ctr,
                                                       ratio=ratio,
                                                       fold_change=fold_change,
                                                       pvalue=pvalue,
                                                       effect_size=es)
                #print expression
            except ValueError as e:
                print e, symbol, seq_id, fold_change, pvalue, ctr, exp
                #break

        #print "Counter=%s; Number of lines:%s" % (counter, num_lines)
        #if counter == num_lines:
        msg = "Successfully integrated signature: %s" % signature.name
        msg_type = messages.SUCCESS
        #else:
        #    msg = "File upload failed."
        #    msg_type = messages.ERROR
        messages.add_message(request, msg_type, ugettext(msg))
        redirect('/expressions/signatures/')

    ctx = {'form': form, 'action': 'Add'}
    return render_to_response('expressions/signature_form.html',
                              ctx,
                              context_instance=RequestContext(request))
Ejemplo n.º 16
0
def create_signatures(request):
    """Generates signatures from profiles."""
    # Sort profiles according to tissues
    # compare DR vs. AL.
    profiles = Profile.objects.all()
    #print len(profiles)
    signatures = {}
    for profile in profiles:
        tissues = ' '.join([tissue.name for tissue in profile.tissue.all()])
        print tissues, profile.diet.shortcut
        if tissues not in signatures:
            signatures[tissues] = [None, None]
        if profile.diet.shortcut == 'DR':
            signatures[tissues][0] = profile
        else:
            signatures[tissues][1] = profile
    print signatures

    for tissues, profiles in signatures.items():
        print tissues, profiles
        signature = Signature(name=tissues,
                              species=profiles[0].species,
                              diet=profiles[0].diet)
        signature.save()
        for tissue in profiles[0].tissue.all():
            signature.tissues.add(tissue)
        for profile in profiles:
            #background = []
            profile.transcripts = {}
            probes = Probe.objects.filter(profile=profile)
            for probe in probes:
                if not probe.name.startswith('RANDOM'):
                    transcript_name = probe.name.split('P')[0]
                    if transcript_name not in profile.transcripts:
                        profile.transcripts[transcript_name] = [
                            probe.expression
                        ]
                    else:
                        profile.transcripts[transcript_name].append(
                            probe.expression)
                #else: # For background subtraction.
                #background.append(probe.expression)

        for transcript_name, exp_expression in profiles[0].transcripts.items():
            # If expression too low of e.g. 1/3 of probes, exclude probe.
            # RMA (background subtraction, quantile normalization, and median polishing)
            # Benjamini p-value

            exp = sum(exp_expression) / len(exp_expression)
            ctr_expression = profiles[1].transcripts[transcript_name]
            ctr = sum(ctr_expression) / len(ctr_expression)
            ratio = exp / ctr
            if ratio < 1: fold_change = -(1 / ratio)
            else: fold_change = ratio
            if len(exp_expression) == 1 or len(ctr_expression) == 1:
                es = pvalue = None
            else:
                es = effect_size(exp_expression, ctr_expression)
                pvalue = t_two_sample(exp_expression,
                                      ctr_expression)[1]  # Calculate p-value.

            transcript = Transcript(seq_id=transcript_name,
                                    ratio=ratio,
                                    fold_change=fold_change,
                                    effect_size=es,
                                    pvalue=pvalue)
            transcript.save()
            expression = Expression.objects.create(signature=signature,
                                                   transcript=transcript,
                                                   exp=exp,
                                                   ctr=ctr,
                                                   ratio=ratio,
                                                   fold_change=fold_change,
                                                   effect_size=es,
                                                   pvalue=pvalue)
    print('Done')
    return redirect('/expressions/signatures/')
Ejemplo n.º 17
0
def _signature_info_from_filename(filename):
    """Parse ``key=value`` pairs separated by ``;`` from an upload's name.

    Only names starting with ``name=`` are treated as descriptive; any other
    name yields an empty dict. Items without ``=`` are ignored.
    """
    if not filename.startswith('name='):
        return {}
    return dict(item.split('=', 1)
                for item in filename.split(';') if '=' in item)


def _signed_fold_change(ratio):
    """Return the signed fold change: ratios below 1 become ``-(1 / ratio)``."""
    if ratio < 1:
        return -(1 / ratio)
    return ratio


def _parse_signature_header(header_line):
    """Map normalised column names of a tab-separated header to their index."""
    header = {}
    for index, column in enumerate(header_line.split('\t')):
        # Columns mentioning "DR" hold the experimental values and columns
        # mentioning "AL" the control values (presumably dietary restriction
        # vs. ad libitum -- TODO confirm).
        if "DR" in column:
            column = "exp"
        elif "AL" in column:
            column = "ctr"

        # Collapse the alternative spellings used by different sources onto
        # the canonical keys ('symbol', 'seq_id') that the row parser reads.
        key = column.lower()
        key = key.replace('gene symbol', 'symbol')
        key = key.replace('gene_symbol', 'symbol')
        key = key.replace(' ', '_')
        key = key.replace('platform_cloneid', 'seq_id')
        key = key.replace('ensembl_gene', 'seq_id')
        header[key] = index
    return header


def add_signature(request):
    """Create a signature from an uploaded tab-separated expression file.

    On POST the uploaded ``file`` is read, a ``Signature`` is created from
    the form fields (falling back to ``key=value`` pairs encoded in the file
    name for name, tissue and diet), and one ``Transcript``/``Expression``
    pair is stored per parseable data row. Rows that cannot be parsed are
    reported on stdout and skipped.

    Returns a redirect to the signature list on success, a redirect back to
    the add form (with an error message) when the upload, species or a tissue
    is missing, and the rendered form for non-POST requests.
    """
    form = SignatureForm(request.POST or None, request.FILES or None)
    if request.POST:
        # Everything below needs the uploaded file, so a missing upload is
        # reported to the user instead of failing later on an unbound name.
        upload = request.FILES.get('file')
        if upload is None:
            msg = "No file or profiles selected. Please provide either a signature "\
                  "file to upload or select profiles to derive a signature."
            messages.add_message(request, messages.ERROR, ugettext(msg))
            return redirect('/expressions/signature/add/')
        upload.name = upload.name.replace('.txt', '')
        data = upload.read().replace('\r', '').split('\n')

        # Infer descriptive information from the filename (empty if the name
        # does not follow the "name=...;key=value" convention).
        info = _signature_info_from_filename(upload.name)
        if 'tissue' in info:
            # '@' is unlikely to appear in a filename, so it serves as the
            # common separator for the accepted delimiters.
            tissue_field = info['tissue'].replace('-', '@')
            tissue_field = tissue_field.replace(', ', '@')
            tissue_field = tissue_field.replace(' and ', '@')
            tissues = tissue_field.split('@')
        else:
            tissues = request.POST.getlist('tissues')

        # The form value wins over the filename; stays None if neither is set.
        regimen = None
        if request.POST.get('diet'):
            regimen = Regimen.objects.get(pk=request.POST['diet'])
        elif 'diet' in info:
            regimen = Regimen.objects.get(shortcut__exact=info['diet'])

        # Species from form:
        try:
            species = Species.objects.get(pk=request.POST['species'])
        except (ValueError, Species.DoesNotExist) as e:
            msg = "Species not found in Denigma db. %s. Please select a species." % e
            messages.add_message(request, messages.ERROR, ugettext(msg))
            return redirect('/expressions/signature/add/')

        # Create signature:
        signature = Signature(
            name=request.POST.get('name') or info.get('name'),
            diet=regimen,
            species=species)
        signature.save()

        # Adding tissues:
        for tissue_ref in tissues:
            try:
                # A primary key if it was selected in the form ...
                tissue = Tissue.objects.get(pk=tissue_ref)
            except (ValueError, TypeError, Tissue.DoesNotExist):
                print("Did not found tissue by pk.")
                try:
                    # ... or a name if it was inferred from the file name.
                    tissue = Tissue.objects.get(name__iexact=tissue_ref)
                except Tissue.DoesNotExist as e:
                    messages.add_message(
                        request, messages.ERROR,
                        ugettext("%s: %s" % (str(e)[:-1], tissue_ref)))
                    return redirect('/expressions/signature/add/')

            signature.tissues.add(tissue)
        print("Tissues: %s" % (signature.tissues.all(),))

        header = _parse_signature_header(data[0])
        # Replicate columns (e.g. keys like "ctr..." / "exp..." other than the
        # plain 'ctr'/'exp' keys) feed the t-test; their indices are the same
        # for every row, so they are collected once.
        ctr_columns = [index for key, index in header.items()
                       if key.startswith('ctr') and key != 'ctr']
        exp_columns = [index for key, index in header.items()
                       if key.startswith('exp') and key != 'exp']
        # The ratio is read from a 'ratio' column if present, otherwise from
        # the 'fold_change' column.
        ratio_key = 'ratio' if 'ratio' in header else 'fold_change'

        print(len(data[1:]))
        for line in data[1:]:
            if not line:
                continue
            columns = line.split('\t')
            if len(columns) < len(header):
                continue
            try:
                seq_id = columns[header['seq_id']]
                symbol = columns[header['symbol']]
                if symbol == "None":
                    symbol = None
                ctr = float(columns[header['ctr']])
                exp = float(columns[header['exp']])
                ratio = float(columns[header[ratio_key]])
                fold_change = _signed_fold_change(ratio)

                ctr_values = [float(columns[index]) for index in ctr_columns]
                exp_values = [float(columns[index]) for index in exp_columns]

                # Effect size is not computed for uploaded signatures.
                es = None
                if exp_values != ctr_values:
                    pvalue = t_two_sample(ctr_values, exp_values)[1]
                else:
                    pvalue = 1

                transcript = Transcript(seq_id=seq_id,
                                        symbol=symbol,
                                        ratio=ratio,
                                        fold_change=fold_change,
                                        pvalue=pvalue,
                                        effect_size=es)
                transcript.save()
                Expression.objects.create(signature=signature,
                                          transcript=transcript,
                                          exp=exp,
                                          ctr=ctr,
                                          ratio=ratio,
                                          fold_change=fold_change,
                                          pvalue=pvalue,
                                          effect_size=es)
            except (ValueError, ZeroDivisionError) as e:
                # Report the raw line only: the parsed values may not have
                # been assigned yet when the failure happened.
                print("%s; skipping line: %r" % (e, line))

        msg = "Successfully integrated signature: %s" % signature.name
        messages.add_message(request, messages.SUCCESS, ugettext(msg))
        return redirect('/expressions/signatures/')

    ctx = {'form': form, 'action': 'Add'}
    return render_to_response('expressions/signature_form.html', ctx,
        context_instance=RequestContext(request))