# Beispiel #1 (example #1 from the scraped source; rating: 0)
    def __init__(self, sentence, window_size):
        """Normalize *sentence*, load the operand/operator dictionaries and
        run the initial tokenization pass.

        sentence -- raw input sentence; lower-cased and spell-corrected here.
        window_size -- phrase window length used when matching k-word phrases.

        Note the ordering below is significant: autocorrect() mutates
        self.sentence, and initializeSentence() requires the dictionaries
        loaded by initializeDictionary().
        """

        self.operands = []
        self.nouns = []
        self.sentence = sentence.lower()
        # Spell-correct before any further processing (mutates self.sentence).
        self.autocorrect()
        self.tags = dict(
        )  #This will store the key as the word in the sentence and the appropriate tag as the value. less_than  is tagged as operator
        self.transformedSentence = None
        self.window = int(window_size)
        self.operandDictionaryFile = 'operandDictionary.txt'
        self.operatorDictionaryFile = 'operatorDictionary.txt'
        self.words = []
        self.operandDictionary = []
        self.operatorDictionary = [
        ]  # not used anymore, not needed. Make the code which uses this inactive, but don't remove
        self.operatorMapping = [
        ]  # this is a list of tuples e.g. ('plus', '+'), ('add', '+'), ('times', '*')
        self.operators = []  # list of operators that we support
        self.tokens = []  # this is the final list of tokens after processing
        self.phrases_k_size = []
        self.tagged_operands = []
        self.tagged_operators = []
        self.phrases_1_size_operator = []
        self.cosineSim = CosineSimilarity()
        self.initializeDictionary()
        self.initializeSentence()

        #self.initializePhrasesOfSizeK()
        #self.initializePhrasesOfSize1Operator()

        self.merger = []
def testRandUniformInput():
	"""Sanity check: force every macrostate residue energy to a uniform value,
	run a single cuckoo-search iteration under four similarity measures, and
	log the best 'match' score of each to "uniform energy similarities.txt".

	Returns None.
	"""
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')  # unused here; kept for parity with sibling tests

	# only looking at MACROSTATE.TS
	# only optimizing backrub temperature and steepness
	ensembleSizes = numpy.array([50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

	print("Initializing objects\n")

	targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
	dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"

	optimizer = Optimizer(MACROSTATES)

	# Slightly different paths on my two computers: fall back to the alternate
	# location when the primary files cannot be read.  (Was a bare `except:`,
	# which also swallowed KeyboardInterrupt/SystemExit.)
	try:
		optimizer.readTargetFrequencies(targetFreqs)
		optimizer.readData(data)
	except Exception:
		optimizer.readTargetFrequencies(targetFreqsAlt)
		optimizer.readData(dataAlt)

	# make energies uniform
	for model in optimizer.models:
		optimizer.models[model].macrostateResidueEnergies = numpy.ones_like(optimizer.models[model].macrostateResidueEnergies)

	search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 1, 1, 0.25)
	search.setMaxIterations(1)
	search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	search.setSearchParameters(False, False, False, False, numpy.array([False, False, False, False, False, False]))
	optimizer.useAlgorithm(search)

	# Context-managed so the file is closed even if an optimize() call raises
	# (previously `open` without `with`, leaking the handle on error).
	with open("uniform energy similarities.txt", 'w') as outfile:
		optimizer.optimize()
		outfile.write("JSD: {:.4f}\n".format(optimizer.getBestParameters()['match']))

		search.setSimilarityMeasure(CosineSimilarity(optimizer.targetFrequencies))
		optimizer.optimize()
		outfile.write("Cosine similarity: {:.4f}\n".format(optimizer.getBestParameters()['match']))

		search.setSimilarityMeasure(KLDivergence(optimizer.targetFrequencies))
		optimizer.optimize()
		outfile.write("K-L divergence: {:.4f}\n".format(optimizer.getBestParameters()['match']))

		search.setSimilarityMeasure(EntropyWeightsMixedSimilarity(CosineSimilarity(), JensenShannonDistance(), optimizer.targetFrequencies))
		optimizer.optimize()
		outfile.write("Weighted mixed similarity: {:.4f}\n".format(optimizer.getBestParameters()['match']))

	return None
    def __init__(self, sentence, window_size):
        """Lower-case and spell-correct *sentence*, load both dictionaries,
        and tokenize the sentence into a mathematical-expression token list."""
        # Raw-text state; autocorrect() rewrites self.sentence in place.
        self.operands = []
        self.nouns = []
        self.sentence = sentence.lower()
        self.autocorrect()

        # Maps each word of the sentence to its tag (e.g. 'less_than' -> operator).
        self.tags = dict()
        self.transformedSentence = None
        self.window = int(window_size)

        # Dictionary files and their in-memory contents.
        self.operandDictionaryFile = 'operandDictionary.txt'
        self.operatorDictionaryFile = 'operatorDictionary.txt'
        self.words = []
        self.operandDictionary = []
        self.operatorDictionary = []  # not used anymore; kept inactive on purpose, do not remove
        self.operatorMapping = []  # tuples such as ('plus', '+'), ('add', '+'), ('times', '*')
        self.operators = []  # operators that we support
        self.tokens = []  # final list of tokens after processing

        # Phrase/tag working buffers.
        self.phrases_k_size = []
        self.tagged_operands = []
        self.tagged_operators = []
        self.phrases_1_size_operator = []

        self.cosineSim = CosineSimilarity()
        self.initializeDictionary()
        self.initializeSentence()

        #self.initializePhrasesOfSizeK()
        #self.initializePhrasesOfSize1Operator()

        self.merger = []
def DHFRcomparemeasures(similarity: int):
	"""Run a full cuckoo-search optimization of the DHFR M20-loop data under
	the similarity measure selected by *similarity*, then write the best
	frequencies (FASTA) and parameters (text) to time-stamped files.

	similarity -- 0 = Jensen-Shannon distance, 1 = cosine, 2 = K-L divergence,
	              3 = entropy-weighted mixed similarity, 4 = entropy-weighted
	              JSD, any other value = chi-squared kernel.
	"""
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')  # unused; kept for parity with sibling functions

	ensembleSizes = numpy.array([20, 50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

	data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat1.tsv"
	targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	optimizer = Optimizer(MACROSTATES)
	optimizer.readTargetFrequencies(targetFreqs)
	optimizer.readData(data)

	# Selector -> (label, lazy similarity-measure factory); factories are
	# lambdas so only the selected measure object is constructed.  The labels
	# previously carried inconsistent leading spaces (" JSD" vs "Chi2 kernel"),
	# which left double spaces in the output file names; normalized here.
	choices = {
		0: ("JSD", lambda: JensenShannonDistance(optimizer.targetFrequencies)),
		1: ("Cos", lambda: CosineSimilarity(optimizer.targetFrequencies)),
		2: ("KLD", lambda: KLDivergence(optimizer.targetFrequencies)),
		3: ("Mix", lambda: EntropyWeightsMixedSimilarity(CosineSimilarity(), JensenShannonDistance(), optimizer.targetFrequencies)),
		4: ("Weighted JSD", lambda: EntropyWeightedSimilarity(JensenShannonDistance(), optimizer.targetFrequencies)),
	}
	measure, makeMeasure = choices.get(similarity, ("Chi2 kernel", lambda: Chi2Kernel(optimizer.targetFrequencies)))

	search = CuckooSearch(optimizer.models, makeMeasure(), False, 64, 1, 0.25)
	search.setMaxIterations(2048)
	search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	search.setSearchParameters(True, True, True, True, numpy.array([True, True, False, True, True, True]))
	optimizer.useAlgorithm(search)
	optimizer.optimize()

	name = "DHFR compare measures " + measure + " " + datetime.now().strftime('%Y%m%d%H%M')
	optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), name + ".fasta", 3)
	optimizer.writeBestParamsToText(name + ".txt")
# Beispiel #5 (example #5 from the scraped source; rating: 0)
 def sim(self, d, q, method):
     """Dispatch to the similarity technique named by *method* and return
     technique.sim(d, q).  Unknown method names print "Default" and
     implicitly return None, matching the original behavior."""
     techniques = {
         "Cosine": CosineSimilarity,
         "Jaccard": JaccardSimilarity,
         "Euclidean": EuclideanSimilarity,
         "Manhattan": ManhattanSimilarity,
     }
     factory = techniques.get(method)
     if factory is None:
         print("Default")
         return None
     # The banner text is exactly the method name, as before.
     print("\n\n" + method + "\n==============\n")
     return factory().sim(d, q)
class SentenceProcessor:
    """Convert an English sentence into a mathematical expression string.

    Pipeline (see initializeSentence): spell-correct the input, strip stop
    words and filler, replace operator words with symbols via the operator
    dictionary, split into tokens, resolve operand phrases against the
    operand dictionary (WordNet synonyms scored with cosine similarity),
    and join the result into ``self.expression``.

    NOTE(review): this class is Python 2 code (``print`` statements).
    """

    def __init__(self,sentence,window_size):
        """Store configuration, load both dictionaries and process *sentence*.

        sentence -- raw input text; lower-cased and spell-corrected here.
        window_size -- length of the word window used for phrase matching.
        """

        self.operands=[]
        self.nouns=[]
        self.sentence=sentence.lower()
        self.autocorrect()
        self.tags=dict()   #This will store the key as the word in the sentence and the appropriate tag as the value. less_than  is tagged as operator
        self.transformedSentence=None
        self.window=int(window_size)
        self.operandDictionaryFile= 'operandDictionary.txt'
        self.operatorDictionaryFile='operatorDictionary.txt'
        self.words=[]
        self.operandDictionary=[] 
        self.operatorDictionary=[] # not used anymore, not needed. Make the code which uses this inactive, but don't remove 
        self.operatorMapping=[] # this is a list of tuples e.g. ('plus', '+'), ('add', '+'), ('times', '*')
        self.operators = [] # list of operators that we support
        self.tokens = [] # this is the final list of tokens after processing
        self.phrases_k_size=[]
        self.tagged_operands=[]
        self.tagged_operators=[]
        self.phrases_1_size_operator=[]
        self.cosineSim=CosineSimilarity()
        self.initializeDictionary()
        self.initializeSentence()

        #self.initializePhrasesOfSizeK()
        #self.initializePhrasesOfSize1Operator()

        self.merger=[]

        # This is a list and will store the transformed sentence made entirely of keys from the tags and relative position as in
        # the original sentence
    def autocorrect(self):
        # Rebuild self.sentence word by word: digits pass through unchanged,
        # everything else goes through spell().  The result gains a leading
        # space, which the later split() calls tolerate.
        # NOTE(review): the loop body mixes tabs and spaces exactly as in the
        # original source -- parses under Python 2 tab expansion; left as-is.
        line=''
        for word in self.sentence.split(' '):
	    if word.isdigit():
		line=line+' '+word
	    else:
            	line=line+' '+spell(word)
        self.sentence=line

    def initializeNouns(self):
        # POS-tag self.words and keep only the nouns (NN/NNP/NNS/NNPS),
        # lower-cased, in self.nouns.  Also refreshes self.tokens/self.tagged.
        self.tokens = nltk.word_tokenize(' '.join(self.words))
        self.tagged = nltk.pos_tag(self.words)
        self.nouns = [word for word,pos in self.tagged \
                      if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
        self.nouns = [x.lower() for x in self.nouns]

    #USED
    def initializeDictionary(self):
        """Load the operand and operator dictionaries from their text files.

        Operand file: one synonym group per line; the first entry of each
        group is the canonical operand name.  Operator file: comma-separated
        trigger words per line with the operator symbol as the last field.
        """
        with open(self.operandDictionaryFile, "r") as ins:
            array = []
            for line in ins.readlines():
                splits=line.split("\n")
                splits=splits[0:-1]   #removing the extra '' blank character getting added
                self.operandDictionary.append(splits)
                self.operands.append(splits[0])

        array = []
        with open(self.operatorDictionaryFile,"r") as ins:
            for line in ins.readlines():
                #print line
                splits=line.split(",")
                # splits[-1][0:-1] is the operator symbol with its trailing newline dropped.
                self.operators.append(splits[-1][0:-1].strip())
                for _ in splits[0:-1]:
                    array.append((_.strip(), splits[-1][0:-1]))
        #print array
        # Longest trigger text first, so e.g. 'greater than or equal to' is
        # substituted before its substring 'greater than'.
        self.operatorMapping = sorted(array, key=lambda x:-len(x[0]))
        print self.operators
        #print array


    def initializeSentence(self):
        """Normalize self.sentence and tokenize it into self.tokens.

        Steps: drop stop words and 'replacement' filler, substitute operator
        text with symbols, strip trailing units from numbers, split into
        words, merge the words between operators into operand phrases, then
        run processTokens() and join everything into self.expression.
        """
        print "original:", self.sentence
        # remove articles/stop words
        for _ in stopwords:
            self.sentence = re.sub(" "+_+" ", ' ', self.sentence)
            if (self.sentence.split(' ',1)[0] == _):
                self.sentence = self.sentence.split(' ',1)[1]
        
        print "after removing articles:", self.sentence
        
        # remove "replacements"
        for _ in replacements:
            self.sentence = re.sub(" "+_+" ", ' ', self.sentence)
        self.sentence = re.sub('is is is', 'is', self.sentence)
        self.sentence = re.sub('is is', 'is', self.sentence)

        print "after removing replacements:", self.sentence

        # replace text with actual operators
        for _ in self.operatorMapping:
           self.sentence = re.sub(_[0], " " + _[1] + " ", self.sentence)
        print "after replacing text with operators:", self.sentence

        # change <number><unit> to <number> e.g. 10meters --> 10
        self.sentence = re.sub('(\d+)[^ \d]*', ' \g<1>', self.sentence)
        print "after changing numbers:", self.sentence

        if self.sentence!=None:
            self.words=self.sentence.split()   #a list of words
            print "after splitting the processed sentence:", self.words

        # This is for merging the parts of string
        # e.g. we have all words in the sentence as a list at this moment.
        # after this step, we will have <part A> <operator> <part B>
        self.tokens = []
        n = len(self.words)
        i = 0
        token = ""
        while i<n:
            if self.words[i] in self.operators or self.words[i] in reserved_words:
                self.tokens.append(token.strip())
                token = ""
                self.tokens.append(self.words[i])
            else:
                token += " "+self.words[i]
            i += 1
        self.tokens.append(token)
        print "After merging the parts of sentence together:", self.tokens
        self.processTokens()
        print "After processing of tokens ",self.tokens
        if self.tokens!=None and len(self.tokens)>0 and self.tokens[0]!=None:
            self.expression=' '.join(self.tokens)
        print "The mathematical expression is ",self.expression


    def processTokens(self):
        """Second pass over self.tokens: resolve ranges ('between A and B'),
        '<op> of A and B' forms, dictionary operands, and the 'by' idiom
        (e.g. 'increased by' -> '=' ... '+').  Mutates self.tokens in place."""
        isRange=False
        isOfType=False  #manages subtraction of, product of, i.e. sum of A and B types
        isOfTypeOp=''  #stores the operator being considered
        prevNoun=''
        for i in range(0,len(self.tokens)):
            token=self.tokens[i]
            if token not in self.operators and token not in reserved_words:
                if isRange: #i.e. we are talking about ranges
                    list=token.split('and')
                    operand1=''
                    operand2=''
                    if len(list)>=2:
                        operand1=list[0].strip()
                        operand2=list[1].strip()

                    if(operand1.isdigit()==True and operand2.isdigit()==True):
                        # Numeric range: emit interval notation 'in [a, b]'.
                        self.tokens[i-1]='in ['+operand1+','
                        self.tokens[i]=operand2+']'
                        isRange=False
                        continue
                    else:
                        # Symbolic range: emit NOUN{a} - NOUN{b}.
                        operand1=self.operandMatching(operand1,0.7)
                        operand2=self.operandMatching(operand2,0.7)
                        self.tokens[i-1]=prevNoun.upper()+'{'+operand1+'} - '
                        self.tokens[i]=prevNoun.upper()+'{'+operand2+'}'
                        isRange=False
                        continue
                if isOfType==True:
                    # '<op> of A and B' -> '(A <op> B)'
                    list=token.split('and')
                    operand1=list[0]
                    operand2=list[1]
                    self.tokens[i-1]='('+operand1+' '+isOfTypeOp
                    self.tokens[i]=operand2+')'
                    isOfType=False
                    isOfTypeOp=''
                    continue

                operand=self.operandMatching(token,0.7)
                if operand!='':
                    self.tokens[i]=operand
                    continue
                if token.strip().isdigit():
                    continue
                prevNoun=token  #if everything fails, this means that this has to be a noun
                self.tokens[i]=''
            elif token=='[':
                isRange=True
            elif token=='++' or token=='--' or token=='//' or token=='**':
                # Doubled symbol marks the '<op> of A and B' form; remember the
                # single-character operator it stands for.
                isOfType=True
                isOfTypeOp=token[0:-1]
            elif token=='by':
                # 'increased/decreased by': turn the previous comparison into
                # an assignment and this token into the matching + or -.
                prevOperatorPos=self.prevOperator()
                if self.tokens[prevOperatorPos]=='>' or self.tokens[prevOperatorPos]=='>=':
                  self.tokens[i]='+'
                  self.tokens[prevOperatorPos]='='
                elif self.tokens[prevOperatorPos]=='<' or self.tokens[prevOperatorPos]=='<=':
                  self.tokens[i]='-'
                  self.tokens[prevOperatorPos]='='

    def prevOperator(self):
        # Return the index of the LAST operator token in self.tokens, or -1.
        operand=''
        pos=-1
        for i in range(0,len(self.tokens)):
            token=self.tokens[i]
            if token in self.operators:
                operand=token
                pos=i
        return pos

    def convertToSynset(self,phrase):
        """Return a maxSize x len(words) matrix: column j holds up to maxSize
        WordNet synonym lemmas of word j (digits are copied verbatim); columns
        with fewer synonyms are padded with empty strings."""
        list=phrase.strip().split()
        maxSize=10
        mat=[['' for x in range(len(list))] for y in range((maxSize))]

        for col in range(0,len(list)):
            li=wn.synsets(list[col])
            print li
            for row in range(0,maxSize):
                str=''
                if list[col].isdigit():
                    mat[row][col]=list[col]
                elif row<len(li) and col<len(mat[0]) and row<len(mat):
                    # Synset names look like 'dog.n.01'; keep only the lemma part.
                    str=li[row].name()
                    posOfDot=str.index('.')
                    str=str[:posOfDot]
                    mat[row][col]=str
        # for r in range(0,len(mat)):
        #     for c in range(0,len(mat[0])):
        #         print mat[r][c]+' '
        #     print '\n'
        return mat

    def operandMatching(self,phrase,threshold):
        """Best-effort match of *phrase* against the operand dictionary.

        Short phrases (< 3 chars) are returned underscore-joined as-is.
        Otherwise a DFS over the synonym matrix from convertToSynset() tries
        synonym combinations against each dictionary entry; the winner is
        accumulated in self.operand and returned.
        NOTE(review): the *threshold* parameter is superseded by the
        hard-coded self.threshold=0.8 below -- confirm which is intended.
        """
        if len(phrase)<3:
            if len(phrase.strip().split(' '))>0:
                return ('_').join(phrase.strip().split(' '))
            return phrase
        maxMatch=0
        operand=''
        phrases=[[]]
        phrases=self.convertToSynset(phrase)  #2 D matrix of synonyms
        self.maxMatch=0
        self.threshold=0.8
        self.operand=''
        print "The phrases are ",phrases
        for dictionaryWordArr in self.operandDictionary:  #each line can have several similar meaning words
            text1=dictionaryWordArr[0].split('_')
            text1=' '.join(text1)
            text1=text1.lower()
            visited=[[False for x in range(len(phrases[0]))] for y in range((len(phrases)))]
            self.DFS(visited,phrases,0,0,'',text1,dictionaryWordArr[0],phrase)

        return self.operand

        #         phrase=' '.join(phrases[row])
        #         print "Phrase is ",phrase
        #         b=self.match1(phrase,text1)
        #         if b>threshold and b>maxMatch or (text1.lower()==phrase.lower()):
        #             maxMatch=b
        #             operand=dictionaryWordArr[0]
        # if operand!='':
 	    #     return operand
        # phrase=phrase.strip()
        # list=phrase.split(' ')
        # if list[0].isdigit():
        #     return phrase

        #return '_'.join(phrase.strip().split(' '))

    def check(self,phrase,text1,operan,originalStr):
        # Score one synonym-combination *phrase* against dictionary text
        # *text1*; keep the best match (or a fallback) in self.operand.
        print "Tried matching ",phrase,text1
        if phrase=='':
            return
        b=self.match1(phrase,text1)
        if b>self.threshold and b>self.maxMatch or (text1.lower()==phrase.lower()):
            self.maxMatch=b
            self.operand=operan
        if self.operand!='':
            return self.operand
        originalStr=originalStr.strip()
        list=originalStr.split(' ')
        if list[0].isdigit():
            self.operand=phrase
            return self.operand
        if phrase!='':
            self.operand='_'.join(phrase.strip().split(' '))

        return self.operand

    def DFS(self,visited,phrases,row,col,phrase,text1,operand,originalStr):
        # Enumerate synonym combinations column by column (one synonym per
        # word); when every column has been consumed, score via check().
        if len(originalStr)<3:
            return originalStr
        if col==len(visited[0]) and row<len(visited) and phrases!='':
            self.check(phrase,text1,operand,originalStr) #TODO
            return
        if col>=len(visited[0]) or row>=len(visited):
            return
        if visited[row][col]==True:
            return

        visited[row][col]=True
        phrase1=phrase+' '+phrases[row][col]
        self.DFS(visited,phrases,row,col+1,phrase1,text1,operand,originalStr)
        visited[row][col]=False
        self.DFS(visited,phrases,row+1,col,phrase,text1,operand,originalStr) #backtracking


    def match1(self,s1,s2):
        # Cosine similarity of two strings; returns 0 when the underlying
        # similarity call fails (e.g. empty vectors).
        sim=0
        try:
            sim=self.cosineSim.cosine_sim(s1,s2)
        except:
            pass
        #print "similarity between", text1, text2, sim
        return sim




    def match(self,phrases,word_splits):
        """Cosine similarity between two word lists (each joined in reversed
        order, skipping empty entries and '[]'); returns False for empty or
        None input."""
        if phrases==None or word_splits==None or len(phrases)==0 or len(word_splits)==0:
            return False
        text1=''
        text2=''
        for w1 in phrases:
            if len(w1)!=0 and w1!='[]':
                text1=w1+" "+text1
        for w2 in word_splits:
            if len(w2)!=0 and w2!='[]':
                text2=w2+" "+text2

        try:
            sim=self.cosineSim.cosine_sim(text1.lower(),text2.lower())
        except:
            # NOTE(review): the retry below will presumably raise the same
            # error again; looks intended to print diagnostics first -- confirm.
            print text1, text2
            sim=self.cosineSim.cosine_sim(text1.lower(),text2.lower())
            pass
        #print "similarity between", text1, text2, sim
        return sim



    #processes a single sentence
    def processSentence(self):
        """Tag operands in self.words and merge the result.  Operator tagging
        is currently disabled (call commented out below)."""

        #self.operatorTagging()
        self.operandTagging()
        self.merge()

    def initializePhrasesOfSizeK(self):
        # Sliding window of self.window consecutive words over the sentence.
        for k in range(0,len(self.words)-self.window+1):  #possible starting points of the string
            phrase=self.words[k:k+int(self.window)]
            self.phrases_k_size.append(phrase)

    def initializePhrasesOfSize1perator(self):
        # Single-word windows.  NOTE(review): method name typo ('1perator')
        # kept -- callers may reference it as-is.
        for k in range(0,len(self.words)):  #possible starting points of the string
            phrase=self.words[k:k+1]
            self.phrases_1_size_operator.append(phrase)

    def merge(self):
        """Combine tagged_operands/tagged_operators into self.merger, pairing
        each operand with the following operator (handling skipped items via
        the prevLeftOut* indices), then build the final expression through
        convertToScietific()."""
        self.merger=[]
        prevWasOperator=False
        prevWasOperand=False
        prevOperatorIndex=-1
        prevOperandIndex=-1
        prevLeftOutOperandIndex=-1
        prevLeftOutOperatorIndex=-1
        for i in range(0,len(self.words)):
            operand=self.tagged_operands[i]
            operator=self.tagged_operators[i]
            self.merger.append('')
            if operand=='' and operator=='': #check if any variable name associated, nouns are important
                if self.words[i].lower() in self.nouns:
                    self.merger[i]=self.words[i]

            if operand=='' and operator=='':
                if self.words[i].isdigit():
                    self.merger[i]=self.words[i]

            # NOTE(review): the condition below tests `operator` twice;
            # presumably `operand=='' and operator==''` was intended -- confirm.
            if operator=='' and operator=='':
                if self.words[i].lower()=='by':
                    # NOTE(review): `a and b or c` parses as `(a and b) or c`,
                    # so merger[prevOperatorIndex] can be read with index -1
                    # when prevOperatorIndex<0 -- confirm intent.
                    if prevOperatorIndex>=0 and  self.merger[prevOperatorIndex]=='>' or self.merger[prevOperatorIndex]=='>=':
                        self.merger[i]='+'
                        self.merger[prevOperatorIndex]='='
                        prevOperatorIndex=i
                    elif prevOperatorIndex>=0 and self.merger[prevOperatorIndex]=='<' or self.merger[prevOperatorIndex]=='<=':
                        self.merger[i]='-'
                        self.merger[prevOperatorIndex]='='
                        prevOperatorIndex=i


            if prevWasOperand==False and prevWasOperator==False:
                if operand!='' and len(operand)>=1:
                    self.merger[i]=operand
                    prevWasOperand=True
                    prevWasOperator=False
                    prevOperandIndex=i

            if prevWasOperand:
                if operator!='' and len(operator)>=1:
                    if prevLeftOutOperandIndex>=0:
                        # Swap: the operator takes the slot of the skipped
                        # operand, which moves to the current position.
                        self.merger[prevLeftOutOperandIndex]=operator
                        self.merger[i]=self.tagged_operands[prevLeftOutOperandIndex]
                        prevWasOperator=False
                        prevWasOperand=True
                        prevOperatorIndex=prevLeftOutOperandIndex
                        prevOperandIndex=i
                        prevLeftOutOperandIndex=-1
                    else:
                        self.merger[i]=operator
                        prevWasOperator=True
                        prevWasOperand=False
                        prevOperatorIndex=i
                if operand!='' and len(operand)>=1 and not (prevOperandIndex>=0 and operand==self.merger[prevOperandIndex]):
                    prevLeftOutOperandIndex=i #this operand has been left out, as we are only looking for operator right now, also check that this should not be a repeating operand
            if prevWasOperator:
                if operand!='' and len(operand)>=1:
                    if prevLeftOutOperatorIndex>=0:
                        self.merger[prevLeftOutOperatorIndex]=operand
                        self.merger[i]=self.tagged_operators[prevLeftOutOperatorIndex]
                        prevWasOperator=True
                        prevWasOperand=False
                        # NOTE(review): the two lines below reuse the *operand*
                        # left-out index inside the *operator* branch;
                        # presumably prevLeftOutOperatorIndex was intended -- confirm.
                        prevOperatorIndex=prevLeftOutOperandIndex
                        prevOperandIndex=i
                        prevLeftOutOperandIndex=-1
                    else:
                        self.merger[i]=operand
                        prevWasOperator=False
                        prevWasOperand=True
                        prevOperandIndex=i
                if operator!='' and len(operator)>=1 and not (prevOperatorIndex>=0 and operator==self.merger[prevOperatorIndex]):
                    prevLeftOutOperatorIndex=i #this operand has been left out, as we are only looking for operator right now, also check that this should not be a repeating operand

        print self.merger #all singly occurring words are nouns, merge all before and after any operator
        self.convertToScietific()
        print self.expression

    def convertToScietific(self):
        """Build self.expression from self.merger: operands and operators are
        appended once per run, while consecutive nouns are collapsed into one
        parenthesized upper-case group.  (Method name typo 'Scietific' kept
        for compatibility with existing callers.)"""
        self.expression=''
        prevWasOperator=False
        prevWasOperand=False
        prevWasNoun=False
        for i in range(0,len(self.merger)):
            word=self.merger[i]
            if word in self.operands:   #means its an operand
                if prevWasOperand==False:
                    if prevWasNoun==True:
                        self.expression=self.expression+')'
                        prevWasNoun=False
                    self.expression=self.expression+' '+word
                    prevWasOperand=True
                    prevWasOperator=False
            elif word in self.operators:
                if prevWasOperator==False:
                    if prevWasNoun==True:
                        self.expression+=')'
                        prevWasNoun=False
                    self.expression+=' '+word
                    prevWasOperator=True
                    prevWasOperand=False
            elif word.lower() in self.nouns:
                if prevWasNoun==False:  #we can add a bracket and start writing the name
                    self.expression+='('+word.upper()
                    prevWasNoun=True
                elif prevWasNoun==True:
                    self.expression+=' '+word.upper()
                    prevWasNoun=True
            else:
                self.expression+=' '+word
        if prevWasNoun==True:
            self.expression+=')'



    #TODO Avinash
    def operandTagging(self):
        """Fill self.tagged_operands: digit words tag themselves; each k-word
        phrase that matches an operand-dictionary entry with similarity > 0.9
        tags the first non-digit slot inside its window."""
        print "Start the operand tagging"
        for i in range(len(self.words)):
            if self.words[i].isdigit():
                self.tagged_operands.append(self.words[i])
            else:
                self.tagged_operands.append('')
        for i in range(len(self.phrases_k_size)):  #k size phrases in a sentence
            phrase=self.phrases_k_size[i]
            phrase=' '.join(phrase)
            threshold=0.9
            maxMatch=0
            for dictionaryWordArr in self.operandDictionary:  #each line can have several similar meaning words
                text1=dictionaryWordArr[0].split('_')
                text1=' '.join(text1)
                b=self.match(phrase,text1)
                if b>threshold and b>maxMatch:
                    maxMatch=b
                    for i1 in range(i,i+self.window):
                        if self.tagged_operands[i1].isdigit()==False:
                            self.tagged_operands[i1]=dictionaryWordArr[0]
                            break
        print "Tagged operand array ",self.tagged_operands
        print self.words

    #TODO Bhavesh. Not used
    def operatorTagging(self):
        """Fill self.tagged_operators by matching k-word phrases against the
        operator trigger words.  Currently unused (see processSentence)."""
        print "Start the operator tagging "# plus,add,sum,addition:+
        for i in range(len(self.words)):
            self.tagged_operators.append('')

        for i in range(len(self.phrases_k_size)):
            operatorPhrase=self.phrases_k_size[i][0:-1]  #this will give me the phrase. greater than the
            self.tagged_operators.append('')
            threshold=0.7
            maxMatch=0
            for key in self.operatorMapping: #we want to avoid the last character
                text=" ".join(str(x) for x in operatorPhrase)
                key1=key[0]
                b=self.match(key1,text)
                if b>=threshold and b>maxMatch:
                    maxMatch=b
                    self.tagged_operators[i]=key[-1]
        print "Tagged operator array ",self.tagged_operators
        print self.words
def repeatTest():
	"""Gauge run-to-run variability of the search: 32 repeats, each running a
	short, quiet 8-nest cuckoo search under Jensen-Shannon distance and then
	under cosine similarity, printing the best match score and its
	re-verification.  Returns None.
	"""
	print("Hello!\n")
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')  # unused here; kept for parity with sibling tests

	ensembleSizes = numpy.array([50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

	print("Initializing objects\n")

	targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
	dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
	# Microstate paths are not used by this test; kept as documentation of
	# where those files live.
	dataMicro = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\20160120_M20_enumeration_scores.tsv"
	dataMicroAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\20160120_M20_enumeration_scores.tsv"

	optimizer = Optimizer(MACROSTATES)

	# Slightly different paths on my two computers: fall back to the alternate
	# location when the primary files cannot be read.  (Was a bare `except:`,
	# which also swallowed KeyboardInterrupt/SystemExit.)
	try:
		optimizer.readTargetFrequencies(targetFreqs)
		optimizer.readData(data)
	except Exception:
		optimizer.readTargetFrequencies(targetFreqsAlt)
		optimizer.readData(dataAlt)

	print("Files read in")

	def _runOnce(measure, label):
		# One short, quiet search with the given similarity measure; prints
		# the best match score and its verification.
		search = CuckooSearch(optimizer.models, measure, False, 8, 1, 0.25)
		search.setMaxIterations(16)
		search.suppressOutputs = True
		search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
		search.setSearchParameters(False, True, True, True, numpy.array([True, True, False, True, True, True]))
		optimizer.useAlgorithm(search)
		print(label)
		optimizer.optimize()
		params = optimizer.getBestParameters()
		print(params['match'])
		print(optimizer.verifyFoundParams(params['ensembleSize'], params['backrubTemp'], params['boltzmannTemp'], params['steepness'], params['weights']))

	for _ in range(32):
		_runOnce(JensenShannonDistance(optimizer.targetFrequencies), "\nJS Dist")
		_runOnce(CosineSimilarity(optimizer.targetFrequencies), "\nCosine")

	return None
# Beispiel #8 (example #8 from the scraped source; rating: 0)
targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

dataAlt = "C:\\users\\candy\\skydrive\\documents\\rotation 2\\DHFR microstates\\microstates.dat"
targetFreqsAlt = "C:\\users\\candy\\skydrive\\documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

optimizer = Optimizer(MACROSTATES, True)

# slightly different paths on my two computers

optimizer.readTargetFrequencies(targetFreqs)
optimizer.readFormattedMicrostateData(data)

# Similarity measure chosen by the numeric selector.  Factories are lambdas
# so only the selected measure object gets constructed (same as the previous
# if/elif chain); any unknown selector falls back to Jensen-Shannon distance.
_measureFactories = {
	0: lambda: JensenShannonDistance(optimizer.targetFrequencies),
	1: lambda: CosineSimilarity(optimizer.targetFrequencies),
	2: lambda: KLDivergence(optimizer.targetFrequencies),
	3: lambda: Chi2Kernel(optimizer.targetFrequencies),
	4: lambda: EntropyWeightedSimilarity(JensenShannonDistance(), optimizer.targetFrequencies),
	5: lambda: EntropyWeightsMixedSimilarity(JensenShannonDistance(), CosineSimilarity(), optimizer.targetFrequencies),
}
_fallback = lambda: JensenShannonDistance(optimizer.targetFrequencies)
search = CuckooSearch(optimizer.models, _measureFactories.get(similarityMeasure, _fallback)(), True, 32, 1, 0.25)

print(str(search.similarityMeasure))

search.setMaxIterations(2048)
search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
# Beispiel #9 (example #9 from the scraped source; rating: 0)
def optimize():
    """Configure a CuckooSearch run from module-level settings and execute it.

    Reads the module-level configuration (``optimizer``, ``simMeas_id``,
    ``microstate_optimize``, ``iterations``, the parameter-bound arrays,
    ``usedstates``, ``output_path``, ``job_tag``) -- presumably set up by the
    surrounding script; TODO confirm against the caller.  Writes the best
    frequencies (FASTA) and parameters (text) into ``output_path``.

    Raises:
        ValueError: if ``simMeas_id`` is not one of the supported ids.
    """
    # Similarity-measure id -> class.  The 'EW'/'EWM' options that were
    # commented out in the original remain intentionally unsupported.
    measures = {
        'JS': JensenShannonDistance,
        'CS': CosineSimilarity,
        'KL': KLDivergence,
        'C2': Chi2Kernel,
    }
    # FIX: an unrecognised simMeas_id previously left 'search' unbound and the
    # function crashed later with UnboundLocalError; fail fast instead.
    try:
        measure = measures[simMeas_id](optimizer.targetFrequencies)
    except KeyError:
        raise ValueError("unsupported similarity measure id: %r" % (simMeas_id,))

    # Preserve the original's strict '== True' comparison semantics.
    vary_ensembles = (microstate_optimize == True)

    search = CuckooSearch(optimizer.models, measure, vary_ensembles, 64, 1, 0.25)
    search.setMaxIterations(iterations)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps,
                          steepnessRange, minWeights, maxWeights)
    # The two original branches differed only in these flags and the output
    # filename prefix; everything else was duplicated verbatim.
    if vary_ensembles:
        search.setSearchParameters(True, False, True, True, usedstates)
        prefix = "var_ensembles_"
    else:
        search.setSearchParameters(False, False, False, True, usedstates)
        prefix = "fixed_ensembles_"

    optimizer.useAlgorithm(search)
    optimizer.optimize()
    optimizer.writeFrequenciesToFASTA(
        optimizer.getBestFrequencies(),
        os.path.join(output_path, prefix + job_tag + ".fasta"))
    optimizer.writeBestParamsToText(
        os.path.join(output_path, prefix + job_tag))
Beispiel #10
0
class SentenceProcessor:
    """Turn an English word-problem sentence into a mathematical expression.

    Pipeline, driven entirely from __init__: spell-correct the sentence,
    strip stop words and filler "replacements", substitute operator words
    (e.g. 'plus') with symbols from operatorDictionary.txt, merge the
    remaining words into operand tokens, then match operand phrases against
    operandDictionary.txt via WordNet synonym expansion + cosine similarity.
    Python 2 code (uses print statements).
    """

    def __init__(self, sentence, window_size):
        """Build the processor and immediately run the full pipeline.

        sentence    -- raw input sentence (lower-cased, then spell-corrected)
        window_size -- phrase window length used by the (currently inactive)
                       k-sized phrase helpers
        """

        self.operands = []
        self.nouns = []
        self.sentence = sentence.lower()
        self.autocorrect()
        self.tags = dict(
        )  #This will store the key as the word in the sentence and the appropriate tag as the value. less_than  is tagged as operator
        self.transformedSentence = None
        self.window = int(window_size)
        self.operandDictionaryFile = 'operandDictionary.txt'
        self.operatorDictionaryFile = 'operatorDictionary.txt'
        self.words = []
        self.operandDictionary = []
        self.operatorDictionary = [
        ]  # not used anymore, not needed. Make the code which uses this inactive, but don't remove
        self.operatorMapping = [
        ]  # this is a list of tuples e.g. ('plus', '+'), ('add', '+'), ('times', '*')
        self.operators = []  # list of operators that we support
        self.tokens = []  # this is the final list of tokens after processing
        self.phrases_k_size = []
        self.tagged_operands = []
        self.tagged_operators = []
        self.phrases_1_size_operator = []
        self.cosineSim = CosineSimilarity()
        self.initializeDictionary()
        self.initializeSentence()

        #self.initializePhrasesOfSizeK()
        #self.initializePhrasesOfSize1Operator()

        self.merger = []

        # This is a list and will store the transformed sentence made entirely of keys from the tags and relative position as in
        # the original sentence
    def autocorrect(self):
        """Spell-correct each non-numeric word of self.sentence, in place."""
        line = ''
        for word in self.sentence.split(' '):
            if word.isdigit():
                line = line + ' ' + word
            else:
                # 'spell' is an external corrector imported elsewhere in the
                # file -- presumably the `autocorrect` package; TODO confirm.
                line = line + ' ' + spell(word)
        self.sentence = line

    def initializeNouns(self):
        """Populate self.nouns with lower-cased NN/NNP/NNS/NNPS words via NLTK POS tagging."""
        self.tokens = nltk.word_tokenize(' '.join(self.words))
        self.tagged = nltk.pos_tag(self.words)
        self.nouns = [word for word,pos in self.tagged \
                      if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
        self.nouns = [x.lower() for x in self.nouns]

    #USED
    def initializeDictionary(self):
        """Load the operand and operator dictionaries from their text files.

        operandDictionary.txt: one operand per line; each line's newline-split
        (minus the trailing '') becomes an entry, its first element an operand.
        operatorDictionary.txt: comma-separated synonyms with the operator
        symbol last, e.g. "plus,add,sum,+"; builds self.operators and
        self.operatorMapping (synonym -> symbol), longest synonyms first so
        longer phrases win during substitution.
        """
        with open(self.operandDictionaryFile, "r") as ins:
            # NOTE(review): this 'array' is never used; it is rebuilt below.
            array = []
            for line in ins.readlines():
                splits = line.split("\n")
                splits = splits[
                    0:-1]  #removing the extra '' blank character getting added
                self.operandDictionary.append(splits)
                self.operands.append(splits[0])

        array = []
        with open(self.operatorDictionaryFile, "r") as ins:
            for line in ins.readlines():
                #print line
                splits = line.split(",")
                # splits[-1][0:-1] strips the trailing newline off the symbol.
                self.operators.append(splits[-1][0:-1].strip())
                for _ in splits[0:-1]:
                    array.append((_.strip(), splits[-1][0:-1]))
        #print array
        # Sort synonyms longest-first so e.g. 'greater than or equal' is
        # substituted before 'greater than'.
        self.operatorMapping = sorted(array, key=lambda x: -len(x[0]))
        print self.operators
        #print array

    def initializeSentence(self):
        """Normalize self.sentence and tokenize it into self.tokens/self.expression.

        Steps: drop stop words, drop filler 'replacements', substitute operator
        words with symbols, strip units off numbers (10meters -> 10), then merge
        consecutive non-operator words into single operand tokens and run
        processTokens() to produce the final expression string.
        """
        print "original:", self.sentence
        # remove articles/stop words
        for _ in stopwords:
            self.sentence = re.sub(" " + _ + " ", ' ', self.sentence)
            if (self.sentence.split(' ', 1)[0] == _):
                self.sentence = self.sentence.split(' ', 1)[1]

        print "after removing articles:", self.sentence

        # remove "replacements"
        for _ in replacements:
            self.sentence = re.sub(" " + _ + " ", ' ', self.sentence)
        self.sentence = re.sub('is is is', 'is', self.sentence)
        self.sentence = re.sub('is is', 'is', self.sentence)

        print "after removing replacements:", self.sentence

        # replace text with actual operators
        for _ in self.operatorMapping:
            self.sentence = re.sub(_[0], " " + _[1] + " ", self.sentence)
        print "after replacing text with operators:", self.sentence

        # change <number><unit> to <number> e.g. 10meters --> 10
        self.sentence = re.sub('(\d+)[^ \d]*', ' \g<1>', self.sentence)
        print "after changing numbers:", self.sentence

        if self.sentence != None:
            self.words = self.sentence.split()  #a list of words
            print "after splitting the processed sentence:", self.words

        # This is for merging the parts of string
        # e.g. we have all words in the sentence as a list at this moment.
        # after this step, we will have <part A> <operator> <part B>
        self.tokens = []
        n = len(self.words)
        i = 0
        token = ""
        while i < n:
            if self.words[i] in self.operators or self.words[
                    i] in reserved_words:
                self.tokens.append(token.strip())
                token = ""
                self.tokens.append(self.words[i])
            else:
                token += " " + self.words[i]
            i += 1
        self.tokens.append(token)
        print "After merging the parts of sentence together:", self.tokens
        self.processTokens()
        print "After processing of tokens ", self.tokens
        if self.tokens != None and len(
                self.tokens) > 0 and self.tokens[0] != None:
            self.expression = ' '.join(self.tokens)
        print "The mathematical expression is ", self.expression

    def processTokens(self):
        """Rewrite self.tokens in place: resolve ranges, 'X of A and B' forms,
        operand-dictionary matches, and 'by' (increase/decrease) phrasing.
        """
        isRange = False
        isOfType = False  #manages subtraction of, product of, i.e. sum of A and B types
        isOfTypeOp = ''  #stores the operator being considered
        prevNoun = ''
        for i in range(0, len(self.tokens)):
            token = self.tokens[i]
            if token not in self.operators and token not in reserved_words:
                if isRange:  #i.e. we are talking about ranges
                    # NOTE(review): 'list' shadows the builtin here and below.
                    list = token.split('and')
                    operand1 = ''
                    operand2 = ''
                    if len(list) >= 2:
                        operand1 = list[0].strip()
                        operand2 = list[1].strip()

                    if (operand1.isdigit() == True
                            and operand2.isdigit() == True):
                        # numeric range: rewrite as interval membership [a, b]
                        self.tokens[i - 1] = 'in [' + operand1 + ','
                        self.tokens[i] = operand2 + ']'
                        isRange = False
                        continue
                    else:
                        # symbolic range: NOUN{a} - NOUN{b}
                        operand1 = self.operandMatching(operand1, 0.7)
                        operand2 = self.operandMatching(operand2, 0.7)
                        self.tokens[
                            i - 1] = prevNoun.upper() + '{' + operand1 + '} - '
                        self.tokens[i] = prevNoun.upper(
                        ) + '{' + operand2 + '}'
                        isRange = False
                        continue
                if isOfType == True:
                    # 'sum of A and B' style: emit (A <op> B)
                    list = token.split('and')
                    operand1 = list[0]
                    operand2 = list[1]
                    self.tokens[i - 1] = '(' + operand1 + ' ' + isOfTypeOp
                    self.tokens[i] = operand2 + ')'
                    isOfType = False
                    isOfTypeOp = ''
                    continue

                operand = self.operandMatching(token, 0.7)
                if operand != '':
                    self.tokens[i] = operand
                    continue
                if token.strip().isdigit():
                    continue
                prevNoun = token  #if everything fails, this means that this has to be a noun
                self.tokens[i] = ''
            elif token == '[':
                isRange = True
            elif token == '++' or token == '--' or token == '//' or token == '**':
                # doubled symbols mark 'X of' forms; keep the single-char op
                isOfType = True
                isOfTypeOp = token[0:-1]
            elif token == 'by':
                # 'increased by' / 'decreased by': turn the previous
                # comparison into '=' and this token into '+' or '-'
                prevOperatorPos = self.prevOperator()
                if self.tokens[prevOperatorPos] == '>' or self.tokens[
                        prevOperatorPos] == '>=':
                    self.tokens[i] = '+'
                    self.tokens[prevOperatorPos] = '='
                elif self.tokens[prevOperatorPos] == '<' or self.tokens[
                        prevOperatorPos] == '<=':
                    self.tokens[i] = '-'
                    self.tokens[prevOperatorPos] = '='

    def prevOperator(self):
        """Return the index of the LAST operator token in self.tokens, or -1.

        NOTE(review): the 'operand' local is assigned but never used.
        """
        operand = ''
        pos = -1
        for i in range(0, len(self.tokens)):
            token = self.tokens[i]
            if token in self.operators:
                operand = token
                pos = i
        return pos

    def convertToSynset(self, phrase):
        """Build a maxSize x len(words) matrix of WordNet synonyms for phrase.

        Column j holds up to maxSize synset head-words for the j-th word
        (digits are copied through unchanged); missing entries stay ''.
        """
        list = phrase.strip().split()
        maxSize = 10
        mat = [['' for x in range(len(list))] for y in range((maxSize))]

        for col in range(0, len(list)):
            li = wn.synsets(list[col])
            print li
            for row in range(0, maxSize):
                # NOTE(review): 'str' shadows the builtin within this loop.
                str = ''
                if list[col].isdigit():
                    mat[row][col] = list[col]
                elif row < len(li) and col < len(mat[0]) and row < len(mat):
                    str = li[row].name()
                    # synset names look like 'dog.n.01'; keep only the lemma
                    posOfDot = str.index('.')
                    str = str[:posOfDot]
                    mat[row][col] = str
        # for r in range(0,len(mat)):
        #     for c in range(0,len(mat[0])):
        #         print mat[r][c]+' '
        #     print '\n'
        return mat

    def operandMatching(self, phrase, threshold):
        """Match phrase against the operand dictionary; return the best operand or ''.

        Expands the phrase into a synonym matrix and DFS-enumerates synonym
        combinations, scoring each against every dictionary entry.

        NOTE(review): the 'threshold' parameter is effectively ignored --
        self.threshold is hard-coded to 0.8 below before the DFS runs.
        """
        if len(phrase) < 3:
            # too short to synonym-match; just underscore-join the words
            if len(phrase.strip().split(' ')) > 0:
                return ('_').join(phrase.strip().split(' '))
            return phrase
        maxMatch = 0
        operand = ''
        phrases = [[]]
        phrases = self.convertToSynset(phrase)  #2 D matrix of synonyms
        self.maxMatch = 0
        self.threshold = 0.8
        self.operand = ''
        print "The phrases are ", phrases
        for dictionaryWordArr in self.operandDictionary:  #each line can have several similar meaning words
            text1 = dictionaryWordArr[0].split('_')
            text1 = ' '.join(text1)
            text1 = text1.lower()
            visited = [[False for x in range(len(phrases[0]))]
                       for y in range((len(phrases)))]
            self.DFS(visited, phrases, 0, 0, '', text1, dictionaryWordArr[0],
                     phrase)

        return self.operand

        #         phrase=' '.join(phrases[row])
        #         print "Phrase is ",phrase
        #         b=self.match1(phrase,text1)
        #         if b>threshold and b>maxMatch or (text1.lower()==phrase.lower()):
        #             maxMatch=b
        #             operand=dictionaryWordArr[0]
        # if operand!='':
#     return operand
# phrase=phrase.strip()
# list=phrase.split(' ')
# if list[0].isdigit():
#     return phrase

#return '_'.join(phrase.strip().split(' '))

    def check(self, phrase, text1, operan, originalStr):
        """Score one candidate synonym phrase against dictionary text text1.

        Updates self.maxMatch/self.operand when the cosine similarity beats
        both self.threshold and the best score so far (or on exact match).
        'operan' is the dictionary operand to record on success.
        """
        print "Tried matching ", phrase, text1
        if phrase == '':
            return
        b = self.match1(phrase, text1)
        if b > self.threshold and b > self.maxMatch or (text1.lower()
                                                        == phrase.lower()):
            self.maxMatch = b
            self.operand = operan
        if self.operand != '':
            return self.operand
        originalStr = originalStr.strip()
        list = originalStr.split(' ')
        if list[0].isdigit():
            self.operand = phrase
            return self.operand
        if phrase != '':
            # fallback: underscore-join the candidate phrase itself
            self.operand = '_'.join(phrase.strip().split(' '))

        return self.operand

    def DFS(self, visited, phrases, row, col, phrase, text1, operand,
            originalStr):
        """Backtracking enumeration of one-synonym-per-column phrases.

        At each cell either append phrases[row][col] and advance a column, or
        skip down a row in the same column; a complete phrase (col past the
        last column) is scored via check().
        """
        if len(originalStr) < 3:
            return originalStr
        if col == len(visited[0]) and row < len(visited) and phrases != '':
            self.check(phrase, text1, operand, originalStr)  #TODO
            return
        if col >= len(visited[0]) or row >= len(visited):
            return
        if visited[row][col] == True:
            return

        visited[row][col] = True
        phrase1 = phrase + ' ' + phrases[row][col]
        self.DFS(visited, phrases, row, col + 1, phrase1, text1, operand,
                 originalStr)
        visited[row][col] = False
        self.DFS(visited, phrases, row + 1, col, phrase, text1, operand,
                 originalStr)  #backtracking

    def match1(self, s1, s2):
        """Cosine similarity of two strings; 0 on any error.

        NOTE(review): the bare except silently hides all failures from
        cosine_sim -- consider narrowing it.
        """
        sim = 0
        try:
            sim = self.cosineSim.cosine_sim(s1, s2)
        except:
            pass
        #print "similarity between", text1, text2, sim
        return sim

    def match(self, phrases, word_splits):
        """Cosine similarity of two word sequences (joined in reverse order).

        Returns False for empty/None input, else the similarity score.
        NOTE(review): if callers pass plain strings instead of lists, the for
        loops iterate characters -- verify call sites (e.g. operandTagging).
        """
        if phrases == None or word_splits == None or len(phrases) == 0 or len(
                word_splits) == 0:
            return False
        text1 = ''
        text2 = ''
        for w1 in phrases:
            if len(w1) != 0 and w1 != '[]':
                text1 = w1 + " " + text1
        for w2 in word_splits:
            if len(w2) != 0 and w2 != '[]':
                text2 = w2 + " " + text2

        try:
            sim = self.cosineSim.cosine_sim(text1.lower(), text2.lower())
        except:
            # NOTE(review): retrying the same call after printing will raise
            # again if the first attempt failed; the 'pass' is unreachable
            # after a second failure.
            print text1, text2
            sim = self.cosineSim.cosine_sim(text1.lower(), text2.lower())
            pass
        #print "similarity between", text1, text2, sim
        return sim

    #processes a single sentence
    def processSentence(self):
        """Alternative driver: tag operands then merge (operator tagging disabled)."""

        #self.operatorTagging()
        self.operandTagging()
        self.merge()

    def initializePhrasesOfSizeK(self):
        """Collect every window-sized slice of self.words into self.phrases_k_size."""
        for k in range(0,
                       len(self.words) - self.window +
                       1):  #possible starting points of the string
            phrase = self.words[k:k + int(self.window)]
            self.phrases_k_size.append(phrase)

    def initializePhrasesOfSize1perator(self):
        """Collect every single-word slice into self.phrases_1_size_operator.

        NOTE(review): name typo -- __init__'s (commented-out) call site says
        initializePhrasesOfSize1Operator, which does not exist.
        """
        for k in range(0, len(
                self.words)):  #possible starting points of the string
            phrase = self.words[k:k + 1]
            self.phrases_1_size_operator.append(phrase)

    def merge(self):
        """Merge tagged_operands/tagged_operators into self.merger, alternating
        operand/operator, then emit the expression via convertToScietific().
        """
        self.merger = []
        prevWasOperator = False
        prevWasOperand = False
        prevOperatorIndex = -1
        prevOperandIndex = -1
        prevLeftOutOperandIndex = -1
        prevLeftOutOperatorIndex = -1
        for i in range(0, len(self.words)):
            operand = self.tagged_operands[i]
            operator = self.tagged_operators[i]
            self.merger.append('')
            if operand == '' and operator == '':  #check if any variable name associated, nouns are important
                if self.words[i].lower() in self.nouns:
                    self.merger[i] = self.words[i]

            if operand == '' and operator == '':
                if self.words[i].isdigit():
                    self.merger[i] = self.words[i]

            # NOTE(review): 'operator == '' and operator == ''' is redundant;
            # the second clause was probably meant to be operand == ''.
            if operator == '' and operator == '':
                if self.words[i].lower() == 'by':
                    if prevOperatorIndex >= 0 and self.merger[
                            prevOperatorIndex] == '>' or self.merger[
                                prevOperatorIndex] == '>=':
                        self.merger[i] = '+'
                        self.merger[prevOperatorIndex] = '='
                        prevOperatorIndex = i
                    elif prevOperatorIndex >= 0 and self.merger[
                            prevOperatorIndex] == '<' or self.merger[
                                prevOperatorIndex] == '<=':
                        self.merger[i] = '-'
                        self.merger[prevOperatorIndex] = '='
                        prevOperatorIndex = i

            if prevWasOperand == False and prevWasOperator == False:
                if operand != '' and len(operand) >= 1:
                    self.merger[i] = operand
                    prevWasOperand = True
                    prevWasOperator = False
                    prevOperandIndex = i

            if prevWasOperand:
                if operator != '' and len(operator) >= 1:
                    if prevLeftOutOperandIndex >= 0:
                        # swap: put the operator where the skipped operand was
                        self.merger[prevLeftOutOperandIndex] = operator
                        self.merger[i] = self.tagged_operands[
                            prevLeftOutOperandIndex]
                        prevWasOperator = False
                        prevWasOperand = True
                        prevOperatorIndex = prevLeftOutOperandIndex
                        prevOperandIndex = i
                        prevLeftOutOperandIndex = -1
                    else:
                        self.merger[i] = operator
                        prevWasOperator = True
                        prevWasOperand = False
                        prevOperatorIndex = i
                if operand != '' and len(operand) >= 1 and not (
                        prevOperandIndex >= 0
                        and operand == self.merger[prevOperandIndex]):
                    prevLeftOutOperandIndex = i  #this operand has been left out, as we are only looking for operator right now, also check that this should not be a repeating operand
            if prevWasOperator:
                if operand != '' and len(operand) >= 1:
                    if prevLeftOutOperatorIndex >= 0:
                        self.merger[prevLeftOutOperatorIndex] = operand
                        self.merger[i] = self.tagged_operators[
                            prevLeftOutOperatorIndex]
                        prevWasOperator = True
                        prevWasOperand = False
                        # NOTE(review): these two lines use the OPERAND
                        # left-out index in the operator branch -- probably
                        # meant prevLeftOutOperatorIndex; confirm intent.
                        prevOperatorIndex = prevLeftOutOperandIndex
                        prevOperandIndex = i
                        prevLeftOutOperandIndex = -1
                    else:
                        self.merger[i] = operand
                        prevWasOperator = False
                        prevWasOperand = True
                        prevOperandIndex = i
                if operator != '' and len(operator) >= 1 and not (
                        prevOperatorIndex >= 0
                        and operator == self.merger[prevOperatorIndex]):
                    prevLeftOutOperatorIndex = i  #this operand has been left out, as we are only looking for operator right now, also check that this should not be a repeating operand

        print self.merger  #all singly occurring words are nouns, merge all before and after any operator
        self.convertToScietific()
        print self.expression

    def convertToScietific(self):
        """Render self.merger into self.expression, wrapping runs of nouns in
        parentheses and upper-casing them.

        NOTE(review): method name has a typo ('Scietific'); kept because
        merge() calls it by this name.
        """
        self.expression = ''
        prevWasOperator = False
        prevWasOperand = False
        prevWasNoun = False
        for i in range(0, len(self.merger)):
            word = self.merger[i]
            if word in self.operands:  #means its an operand
                if prevWasOperand == False:
                    if prevWasNoun == True:
                        self.expression = self.expression + ')'
                        prevWasNoun = False
                    self.expression = self.expression + ' ' + word
                    prevWasOperand = True
                    prevWasOperator = False
            elif word in self.operators:
                if prevWasOperator == False:
                    if prevWasNoun == True:
                        self.expression += ')'
                        prevWasNoun = False
                    self.expression += ' ' + word
                    prevWasOperator = True
                    prevWasOperand = False
            elif word.lower() in self.nouns:
                if prevWasNoun == False:  #we can add a bracket and start writing the name
                    self.expression += '(' + word.upper()
                    prevWasNoun = True
                elif prevWasNoun == True:
                    self.expression += ' ' + word.upper()
                    prevWasNoun = True
            else:
                self.expression += ' ' + word
        if prevWasNoun == True:
            self.expression += ')'

    #TODO Avinash
    def operandTagging(self):
        """Fill self.tagged_operands: digits pass through; k-sized phrases are
        matched against the operand dictionary above a 0.9 threshold.
        """
        print "Start the operand tagging"
        for i in range(len(self.words)):
            if self.words[i].isdigit():
                self.tagged_operands.append(self.words[i])
            else:
                self.tagged_operands.append('')
        for i in range(len(
                self.phrases_k_size)):  #k size phrases in a sentence
            phrase = self.phrases_k_size[i]
            phrase = ' '.join(phrase)
            threshold = 0.9
            maxMatch = 0
            for dictionaryWordArr in self.operandDictionary:  #each line can have several similar meaning words
                text1 = dictionaryWordArr[0].split('_')
                text1 = ' '.join(text1)
                # NOTE(review): match() iterates its arguments; passing joined
                # strings here means it iterates characters -- verify.
                b = self.match(phrase, text1)
                if b > threshold and b > maxMatch:
                    maxMatch = b
                    # tag the first non-digit slot inside this window
                    for i1 in range(i, i + self.window):
                        if self.tagged_operands[i1].isdigit() == False:
                            self.tagged_operands[i1] = dictionaryWordArr[0]
                            break
        print "Tagged operand array ", self.tagged_operands
        print self.words

    #TODO Bhavesh. Not used
    def operatorTagging(self):
        """Fill self.tagged_operators by matching k-sized phrases (minus their
        last word) against operator synonyms.  Marked unused upstream.
        """
        print "Start the operator tagging "  # plus,add,sum,addition:+
        for i in range(len(self.words)):
            self.tagged_operators.append('')

        for i in range(len(self.phrases_k_size)):
            operatorPhrase = self.phrases_k_size[i][
                0:-1]  #this will give me the phrase. greater than the
            self.tagged_operators.append('')
            threshold = 0.7
            maxMatch = 0
            for key in self.operatorMapping:  #we want to avoid the last character
                text = " ".join(str(x) for x in operatorPhrase)
                key1 = key[0]
                b = self.match(key1, text)
                if b >= threshold and b > maxMatch:
                    maxMatch = b
                    self.tagged_operators[i] = key[-1]
        print "Tagged operator array ", self.tagged_operators
        print self.words
Beispiel #11
0
  with open(fname, 'rb') as f:
    return marshal.load(f)

if __name__ == '__main__':
    if len(sys.argv) < 4:
        print "Usage: python rank.py task_number queryDocTrainData queryDocTrainRel queryDocTestData"
        exit(0)
    task_number = sys.argv[1]
    training_file = sys.argv[4]
    #Load idf
    idf = unserialize_data('model/idf')
    avg_doc_length = unserialize_data('model/avg_file_length')
    weights_file = open('Weights','w')
    if task_number == '1':
        #Build Cosine similarity measure
        scorer = CosineSimilarity(training_file, idf)
        weights_file.write(str(scorer.C1) + ' ' + str(scorer.C2) + ' '  + str(scorer.C3))
        weights_file.close()
    elif task_number == '2':
        #Build BM25 similarity measure
        scorer = BM25(training_file, idf, avg_doc_length)
        weights_file.write(str(scorer.W_title) + ' ' + str(scorer.W_body) + ' '  + str(scorer.W_anchor) + ' ' +\
                           str(scorer.B_title) + ' ' + str(scorer.B_body) + ' ' + str(scorer.B_anchor) + str(scorer.K1))
        weights_file.close
    elif task_number == '3':
        #Build smallest window similarity measure with Cosine Similarity
        scorer = SmallestWindow(training_file,"cosinesimilarity", idf)
        weights_file.write(str(scorer.C1) + ' ' + str(scorer.C2) + ' '  + str(scorer.C3) +  ' ' + str(scorer.B))
        weights_file.close()
    elif task_number == '4':
        #Build page importance model
Beispiel #12
0
        return marshal.load(f)


if __name__ == '__main__':
    if len(sys.argv) < 4:
        print "Usage: python rank.py task_number queryDocTrainData queryDocTrainRel queryDocTestData"
        exit(0)
    task_number = sys.argv[1]
    training_file = sys.argv[4]
    #Load idf
    idf = unserialize_data('model/idf')
    avg_doc_length = unserialize_data('model/avg_file_length')
    weights_file = open('Weights', 'w')
    if task_number == '1':
        #Build Cosine similarity measure
        scorer = CosineSimilarity(training_file, idf)
        weights_file.write(
            str(scorer.C1) + ' ' + str(scorer.C2) + ' ' + str(scorer.C3))
        weights_file.close()
    elif task_number == '2':
        #Build BM25 similarity measure
        scorer = BM25(training_file, idf, avg_doc_length)
        weights_file.write(str(scorer.W_title) + ' ' + str(scorer.W_body) + ' '  + str(scorer.W_anchor) + ' ' +\
                           str(scorer.B_title) + ' ' + str(scorer.B_body) + ' ' + str(scorer.B_anchor) + str(scorer.K1))
        weights_file.close
    elif task_number == '3':
        #Build smallest window similarity measure with Cosine Similarity
        scorer = SmallestWindow(training_file, "cosinesimilarity", idf)
        weights_file.write(
            str(scorer.C1) + ' ' + str(scorer.C2) + ' ' + str(scorer.C3) +
            ' ' + str(scorer.B))