# Example #1
# 0
def cellHarmony(species,
                platform,
                query_exp_file,
                exp_output,
                customMarkers=False,
                useMulti=False,
                fl=None,
                customLabels=None):
    """ Prepare pre-aligned result files in a pre-defined format for cellHarmony post-alignment
    differential and visualization analyses.

    Arguments:
    species -- handed unchanged to LineageProfilerIterate.harmonizeClassifiedSamples
    platform, exp_output, useMulti -- accepted but not used in this function
    query_exp_file -- expression file that is copied into the cellHarmony directory and
        from which the query cells are extracted
    customMarkers -- path of the reference expression file (pre-formatted from Seurat or other outputs)
    fl -- object providing Labels() and setLabels(); required despite the None default
    customLabels -- ignored: the value is always replaced by fl.Labels()

    Returns True, also when the final harmonizeClassifiedSamples step fails (the traceback is printed).
    """

    customLabels = fl.Labels()  ### always overrides the customLabels argument
    reference_exp_file = customMarkers  ### pre-formatted from Seurat or other outputs

    export_directory = os.path.abspath(os.path.join(query_exp_file, os.pardir))
    if 'ExpressionInput' in query_exp_file:
        ### Change to the root directory above ExpressionInput
        export_directory = os.path.abspath(
            os.path.join(export_directory, os.pardir))

    ### Create the output folders (best effort - an existing folder is not an error)
    cellharmony_directory = export_directory + '/cellHarmony/'
    for subdirectory in ('', 'CellClassification/', 'OtherFiles/'):
        try:
            os.mkdir(cellharmony_directory + subdirectory)
        except OSError:
            pass  ### a real failure surfaces below when the files are written

    ### Get the reference and query cells (in their respective order) and the dataset names
    reference_cells, query_cells, reference_dataset, query_dataset = importCelltoClusterAnnotations(
        customLabels)

    ### copy and re-name the input expression file to the output cellHarmony directory
    if len(reference_dataset) > 0 and len(query_dataset) > 0:
        target_exp_dir = export_directory + '/cellHarmony/exp.' + reference_dataset + '__' + query_dataset + '-AllCells.txt'
    else:
        target_exp_dir = export_directory + '/cellHarmony/exp.cellHarmony-reference__Query-AllCells.txt'
        reference_dataset = 'cellHarmony-reference'
        ### NOTE(review): query_dataset is left as returned above (possibly empty) - confirm this is intended
    shutil.copy(query_exp_file, target_exp_dir)

    ### filter and export the heatmap with just reference cells
    cell_cluster_order = simpleHeaderImport(reference_exp_file)
    filtered_reference_cells = []
    filtered_query_cells_db = {}
    filtered_query_cells = []
    representative_refcluster_cell = {}
    for cell_id in cell_cluster_order:
        if cell_id in reference_cells:
            filtered_reference_cells.append(cell_id)
            cluster_label = reference_cells[cell_id].Label()
            ### Identifies where to place each query cell
            representative_refcluster_cell.setdefault(cluster_label, []).append(cell_id)
        elif cell_id in query_cells:
            filtered_query_cells_db[cell_id] = query_cells[cell_id]
            filtered_query_cells.append(cell_id)

    reference_output_file = export_directory + '/cellHarmony/OtherFiles/' + reference_dataset + '.txt'
    reference_output_file2 = export_directory + '/cellHarmony/exp.' + reference_dataset + '__' + query_dataset + '-Reference.txt'
    query_output_file = export_directory + '/' + query_dataset + '.txt'

    ### Write out separate reference and query files
    from import_scripts import sampleIndexSelection
    sampleIndexSelection.filterFile(reference_exp_file,
                                    reference_output_file,
                                    ['row_clusters-flat'] +
                                    filtered_reference_cells,
                                    force=True)
    sampleIndexSelection.filterFile(target_exp_dir,
                                    query_output_file,
                                    filtered_query_cells,
                                    force=True)
    shutil.copy(reference_output_file, reference_output_file2)

    ### export the CellClassification file
    output_classification_file = export_directory + '/cellHarmony/CellClassification/CellClassification.txt'
    exportCellClassifications(output_classification_file,
                              filtered_query_cells_db, filtered_query_cells,
                              representative_refcluster_cell)
    labels_file = export_directory + '/labels.txt'
    exportLabels(labels_file, filtered_reference_cells, reference_cells)
    fl.setLabels(labels_file)

    print('Files formatted for cellHarmony... running differential expression analyses')
    try:
        print(reference_output_file)
        print(query_output_file)
        print(output_classification_file)
        LineageProfilerIterate.harmonizeClassifiedSamples(
            species,
            reference_output_file,
            query_output_file,
            output_classification_file,
            fl=fl)
    except (Exception, SystemExit):
        ### Report the failure but still return normally; SystemExit is included because the
        ### downstream code is not visible here and may abort that way. Ctrl-C is not swallowed.
        print('\nFAILED TO COMPLETE THE FULL CELLHARMONY ANALYSIS (SEE LOG FILE)...')
        print(traceback.format_exc())

    return True

if __name__ == '__main__':

    import getopt

    ################  Command-line arguments ################
    ### Both --Guidefile and --PSIfile are required; start from None so that a missing
    ### flag is reported instead of raising a NameError further down
    Guidefile = None
    PSIfile = None
    if len(sys.argv[1:]) <= 1:  ### Indicates that there are insufficient number of command-line arguments
        print("Warning! Insufficient command line flags supplied.")
        sys.exit()
    else:
        options, remainder = getopt.getopt(sys.argv[1:], '', ['Guidefile=', 'PSIfile='])
        for opt, arg in options:
            if opt == '--Guidefile':
                Guidefile = arg
            elif opt == '--PSIfile':
                PSIfile = arg
            else:
                print("Warning! Command-line argument: %s not recognized. Exiting..." % opt)
                sys.exit()

    if Guidefile is None or PSIfile is None:
        print("Warning! Insufficient command line flags supplied.")
        sys.exit()

    ### Example inputs used during development:
    #filename="/Users/meenakshi/Documents/leucegene/ICGS/Clustering-exp.Hs_RNASeq_top_alt_junctions367-Leucegene-75p_no149-Guide1 TRAK1&ENSG00000182606&I1.1_42075542-E2.1__E-hierarchical_cosine_correlation.txt"
    #PSIfile="/Users/meenakshi/Documents/leucegene/ExpressionInput/exp.Hs_RNASeq_top_alt_junctions-PSI_EventAnnotation-367-Leucegene-75p-unique-filtered-filtered.txt"
    #keylabel="/Users/meenakshi/Documents/leucegene/ExpressionInput/exp.round2_glmfilteredKmeans_label.txt"

    ### Restrict the PSI file to the columns returned by header_file() for the guide file,
    ### then run the correlation and depletion steps on the filtered file
    header = header_file(Guidefile)
    output_file = PSIfile[:-4] + "-filtered.txt"
    sampleIndexSelection.filterFile(PSIfile, output_file, header)
    commonkeys, count = FindCorrelations(Guidefile, output_file)
    DepleteSplicingevents(commonkeys, output_file, count)
def computeOnSexTranscripts(fn):
    """ Import a flat single-cell expression matrix and split its cells by sex-specific transcripts.

    fn -- tab-delimited file; the first row holds the cell names (first column skipped),
        every other row is a gene identifier followed by numeric values.

    For every cell the average of the female transcripts minus the average of the male
    transcripts is computed: a positive value is called female, a negative value male and
    anything else unknown. The percentages are printed and three files are written next
    to fn (its last four characters replaced): '-female.txt', '-male.txt' and '-groups.txt'.
    """
    female = [
        'TSIX', 'XIST', 'ENSG00000229807', 'ENSMUSG00000086503',
        'ENSG00000270641', 'ENSMUSG00000085715'
    ]
    ### 'EDX3Y' looks like a typo for DDX3Y (its Ensembl IDs ENSG00000067048 and
    ### ENSMUSG00000069045 are listed below); the original entry is kept and 'DDX3Y' added
    male = [
        'EIF2S3Y', 'EDX3Y', 'DDX3Y', 'UTY', 'ENSMUSG00000069049', 'ENSMUSG00000069049',
        'ENSG00000067048', 'ENSMUSG00000069045', 'ENSMUSP00000070012',
        'ENSG00000183878'
    ]
    male_genes = set(male)
    informative_genes = male_genes.union(female)

    female_data = []
    male_data = []
    header = []
    firstRow = True
    ### 'with' guarantees the input file is closed (the handle was previously leaked)
    with open(fn, 'rU') as import_object:
        for line in import_object:
            data = cleanUpLine(line)
            t = data.split('\t')
            if firstRow:
                header = t[1:]
                firstRow = False
                continue
            gene_upper = t[0].upper()
            if gene_upper not in informative_genes:
                continue
            values = [float(value) for value in t[1:]]
            if gene_upper in male_genes:
                male_data.append(values)
            else:
                female_data.append(values)

    ### Per-cell average over the detected genes of each sex
    male_data = np.average(np.array(male_data), axis=0)
    female_data = np.average(np.array(female_data), axis=0)
    ratios = [i - j for i, j in zip(female_data, male_data)]

    females = []
    males = []
    unknowns = []
    for i, ratio in enumerate(ratios):
        if ratio > 0:
            females.append(header[i])
        elif ratio < 0:
            males.append(header[i])
        else:
            unknowns.append(header[i])

    total = (len(males) * 1.00) + (len(females) * 1.00) + len(unknowns) * 1.00
    male_ratio = 100 * (len(males) * 1.00) / total
    female_ratio = 100 * (len(females) * 1.00) / total
    uknown_ratio = 100 * (len(unknowns) * 1.00) / total

    print('Male cell percentage: %s' % male_ratio)
    print('Female cell percentage: %s' % female_ratio)
    print('Unknown cell percentage: %s' % uknown_ratio)

    from import_scripts import sampleIndexSelection
    sampleIndexSelection.filterFile(fn,
                                    fn[:-4] + '-female.txt',
                                    ['row_clusters-flat'] + females,
                                    force=True)
    sampleIndexSelection.filterFile(fn,
                                    fn[:-4] + '-male.txt',
                                    ['row_clusters-flat'] + males,
                                    force=True)

    ### Explicit loops instead of map(): map() used only for its side effects writes
    ### nothing when it is lazy, and 'with' closes the file even if a write fails
    with open(fn[:-4] + '-groups.txt', 'w') as export_object:
        for group, cells in (('male', males), ('female', females), ('unknown', unknowns)):
            for cell in cells:
                export_object.write(cell + '\t' + group + '\n')
def removeMarkerFinderDoublets(heatmap_file, diff=1):
    """ Split the cells of a MarkerFinder heatmap file into singlets and multiplets.

    heatmap_file -- file loaded through clustering.remoteImportData
    diff -- minimum fold difference; it also scales the per-cluster reference threshold

    Rows are grouped by their row cluster and averaged, giving every cell (column) one
    score per cluster. For each cell the score of its assigned cluster is compared to the
    best score among the other clusters. A cell is kept when 2**(own - best_other) exceeds
    both diff and diff times the same quantity computed from the per-cluster medians
    (presumably the scores are log2 values - confirm against the input files).

    Writes '<heatmap_file minus last 4 characters>-Singlets.txt' and '-Multiplets.txt'.
    Raises ValueError when only one row cluster exists (there is no alternative score).
    """
    matrix, column_header, row_header, dataset_name, group_db, priorColumnClusters, priorRowClusters = clustering.remoteImportData(
        heatmap_file)

    priorRowClusters.reverse()
    if len(priorColumnClusters) == 0:
        ### No stored clusters: derive them from the 'cluster:name' prefixes
        for c in column_header:
            priorColumnClusters.append(c.split(':')[0])
        for r in row_header:
            priorRowClusters.append(r.split(':')[0])

    import collections

    ### Group the matrix rows by row cluster, preserving first-seen cluster order
    cluster_db = collections.OrderedDict()
    for i, cluster in enumerate(priorRowClusters):
        cluster_db.setdefault(cluster, []).append(matrix[i])

    ### One mean profile per cluster, then transpose to get one score vector per cell
    clusters = list(cluster_db)
    cluster_means = [numpy.mean(cluster_db[cluster], axis=0) for cluster in clusters]
    transposed_data_matrix = zip(*cluster_means)

    cell_max_scores = []
    cell_max_score_db = collections.OrderedDict()
    for i, cell_scores in enumerate(transposed_data_matrix):
        cluster = priorColumnClusters[i]
        ci = clusters.index(cluster)
        cell_state_score = cell_scores[ci]  ### This is the score for that cell for its assigned MarkerFinder cluster
        ### Select the alternatives by position, not by value: comparing values would also
        ### drop any other cluster whose score happens to tie with the assigned cluster
        alternate_state_scores = [
            score for j, score in enumerate(cell_scores) if j != ci
        ]
        ### max and secondary max score - max for the cell-state should be greater than secondary max
        score_summary = [
            cell_state_score,
            max(alternate_state_scores),
            sum(alternate_state_scores)
        ]
        cell_max_scores.append(score_summary)
        cell_max_score_db.setdefault(cluster, []).append(score_summary)

    ### Median [own score, best alternative, alternative sum] per cluster; the difference
    ### between the first two gives the threshold for detecting doublets
    for cluster in cell_max_score_db:
        cell_max_score_db[cluster] = numpy.median(cell_max_score_db[cluster], axis=0)

    ### 'row_clusters-flat' is the leading entry of every list handed to filterFile
    keep = ['row_clusters-flat']
    keep_alt = ['row_clusters-flat']
    remove = ['row_clusters-flat']
    remove_alt = ['row_clusters-flat']
    for i, (cell_score, alt_score, alt_sum) in enumerate(cell_max_scores):
        cluster = priorColumnClusters[i]
        cell = column_header[i]
        ref_max, ref_alt, ref_sum = cell_max_score_db[cluster]
        ref_diff = math.pow(2, (ref_max - ref_alt)) * diff
        cell_diff = math.pow(2, (cell_score - alt_score))

        ### Name without the 'cluster:' prefix, if a prefix was added
        name_parts = cell.split(':')
        if len(name_parts) > 1:
            cell_alt = name_parts[1]
        else:
            cell_alt = cell

        if cell_diff > ref_diff and cell_diff > diff:  # cell_diff is the crucial cutoff
            keep.append(cell)
            keep_alt.append(cell_alt)
        else:
            remove.append(cell)
            remove_alt.append(cell_alt)

    ### Exclude the 'row_clusters-flat' entry from the reported count
    print('Number of cells to keep: %d out of %d' % (len(keep) - 1, len(column_header)))

    from import_scripts import sampleIndexSelection
    exports = (('-Singlets.txt', keep, keep_alt),
               ('-Multiplets.txt', remove, remove_alt))
    for suffix, cell_names, cell_names_alt in exports:
        output_file = heatmap_file[:-4] + suffix
        try:
            sampleIndexSelection.filterFile(heatmap_file, output_file, cell_names)
        except (Exception, SystemExit):
            ### Retry with the prefix-stripped names. filterFile is not visible here, so
            ### SystemExit is still caught in case it aborts that way; Ctrl-C is not swallowed.
            sampleIndexSelection.filterFile(heatmap_file, output_file, cell_names_alt)