def main(originalFile, w2vFile, w2vDimension, topicModelFile,
         topicModelDimension, infoInstance, tfidfInstance, hasUrlInstance,
         ansProInstance):
    """Build one feature list per comment file found under ``originalFile``.

    ``originalFile`` is scanned for sub-directories.  Each sub-directory is
    handled as one question: the file named like the directory holds the
    question text, every other file holds one comment and its file name is
    used as the comment id (cid).

    Args:
        originalFile: root directory with one sub-directory per question.
        w2vFile, w2vDimension: passed through to ``W2V``.
        topicModelFile, topicModelDimension: passed through to ``TopicModel``.
        infoInstance: object offering ``cidToCuserid``, ``cidToQuserid`` and
            ``qidToCategory``.
        tfidfInstance: object offering ``getTfidfScore(cid)``.
        hasUrlInstance: object offering ``isExistUrl(cid)``.
        ansProInstance: object offering ``getCategoryPro(category)``; the
            returned sequence is flattened into the feature list.

    Returns:
        dict mapping each cid to the list
        ``[bow, w2v, topic, sameUser, <category values...>, tfidf, url]``
        where the first three entries are cosine similarities between the
        question and the comment and ``sameUser`` is 1.0 when the comment
        author equals the question author, else 0.0.

    NOTE(review): later definitions of ``main`` in this file rebind the name,
    so this version is not the one callers reach - confirm which is intended.
    """
    # Placeholder similarity used when the question or the comment is empty
    # after preprocessing.
    emptyTextScore = 0.000000000001

    utility = Utility()
    w2v = W2V(w2vFile, w2vDimension)
    tm = TopicModel(topicModelFile, topicModelDimension)

    resultDict = {}

    questionDirs = [d for d in listdir(originalFile)
                    if isdir(join(originalFile, d))]
    for qid in questionDirs:
        # join() rather than string concatenation: the directory filter above
        # already uses join(), so both now agree even when originalFile has
        # no trailing path separator.
        path = join(originalFile, qid)
        fileList = [f for f in listdir(path) if isfile(join(path, f))]

        # Question file: named exactly like its directory.
        with open(join(path, qid), "r") as fin:
            s1 = fin.read()
        vec1 = w2v.sentenceVector(s1)
        t1 = tm.getProbability(qid)

        # Comment files: everything else in the directory.
        for cid in fileList:
            if cid == qid:
                continue

            cuserid = infoInstance.cidToCuserid(cid)
            quserid = infoInstance.cidToQuserid(cid)
            qcategory = infoInstance.qidToCategory(qid)
            sameUser = 1.0 if cuserid == quserid else 0.0

            # NOTE: the category values of the train set have to be recorded
            # once before getCategoryPro() can be used on train, dev and test.
            # To record them, temporarily use the following instead of the
            # getCategoryPro() call below:
            #
            #   categoryPro = infoInstance.getCategoryAnsPro(qcategory)
            #   cg = open("categoryAnsProTrain.txt", "a+")
            #   cg.write(qcategory + "\t")
            #   for i in range(len(categoryPro)):
            #       cg.write(str(categoryPro[i]) + "\t")
            #   cg.write("\n")
            categoryPro = ansProInstance.getCategoryPro(qcategory)
            tfidfScore = tfidfInstance.getTfidfScore(cid)
            hasUrl = hasUrlInstance.isExistUrl(cid)

            with open(join(path, cid), "r") as fin:
                s2 = fin.read()

            if not s1 or not s2:
                # Some questions and comments are empty after preprocessing.
                bowScore = w2vScore = tmScore = emptyTextScore
            else:
                v1, v2 = BOW(s1, s2).getVector()
                bowScore = utility.cosine(v1, v2)
                w2vScore = utility.cosine(vec1, w2v.sentenceVector(s2))
                tmScore = utility.cosine(t1, tm.getProbability(cid))

            features = [bowScore, w2vScore, tmScore, sameUser]
            features.extend(categoryPro)
            features.append(tfidfScore)
            features.append(hasUrl)
            resultDict[cid] = features

    # Progress messages kept verbatim; the parenthesised form prints the same
    # text under the Python 2 print statement.
    print("bowDict, w2vDict, tmDict done!")
    print("resultDict done!")
    return resultDict
def main(originalFile, w2vFile, w2vDimension, topicModelFile,
         topicModelDimension, infoInstance, tfidfInstance, hasUrlInstance,
         ansProInstance, ynInstance):
    """Build one feature list per selected comment under ``originalFile``.

    ``originalFile`` is scanned for sub-directories.  Each sub-directory is
    handled as one question: the file named like the directory holds the
    question text, every other file holds one comment and its file name is
    used as the comment id (cid).  Only comments whose cid is listed by
    ``ynInstance.getCidList()`` are processed.

    Args:
        originalFile: root directory with one sub-directory per question.
        w2vFile, w2vDimension: passed through to ``W2V``.
        topicModelFile, topicModelDimension: passed through to ``TopicModel``.
        infoInstance: object offering ``cidToCuserid``, ``cidToQuserid`` and
            ``qidToCategory``.
        tfidfInstance: object offering ``getTfidfScore(cid)``.
        hasUrlInstance: object offering ``isExistUrl(cid)``.
        ansProInstance: object offering ``getCategoryPro(category)``; the
            returned sequence is flattened into the feature list.
        ynInstance: object offering ``getCidList()``, the cids to keep.

    Returns:
        dict mapping each kept cid to the list
        ``[bow, w2v, topic, sameUser, <category values...>, tfidf, url]``
        where the first three entries are cosine similarities between the
        question and the comment and ``sameUser`` is 1.0 when the comment
        author equals the question author, else 0.0.

    NOTE(review): later definitions of ``main`` in this file rebind the name,
    so this version is not the one callers reach - confirm which is intended.
    """
    # Placeholder similarity used when the question or the comment is empty
    # after preprocessing.
    emptyTextScore = 0.000000000001

    # Set membership replaces the former dict with dummy values.
    wantedCids = set(ynInstance.getCidList())

    utility = Utility()
    w2v = W2V(w2vFile, w2vDimension)
    tm = TopicModel(topicModelFile, topicModelDimension)

    resultDict = {}

    questionDirs = [d for d in listdir(originalFile)
                    if isdir(join(originalFile, d))]
    for qid in questionDirs:
        # join() rather than string concatenation: the directory filter above
        # already uses join(), so both now agree even when originalFile has
        # no trailing path separator.
        path = join(originalFile, qid)
        fileList = [f for f in listdir(path) if isfile(join(path, f))]

        # Question file: named exactly like its directory.
        with open(join(path, qid), "r") as fin:
            s1 = fin.read()
        vec1 = w2v.sentenceVector(s1)
        t1 = tm.getProbability(qid)

        # Comment files: everything else in the directory.
        for cid in fileList:
            if cid == qid:
                continue
            # Skip just this file.  Leaving the loop here (as before) dropped
            # every remaining comment of the question, and since listdir()
            # returns entries in arbitrary order the outcome was not
            # reproducible when only part of a directory is listed.
            if cid not in wantedCids:
                continue

            cuserid = infoInstance.cidToCuserid(cid)
            quserid = infoInstance.cidToQuserid(cid)
            qcategory = infoInstance.qidToCategory(qid)
            sameUser = 1.0 if cuserid == quserid else 0.0

            categoryPro = ansProInstance.getCategoryPro(qcategory)
            tfidfScore = tfidfInstance.getTfidfScore(cid)
            hasUrl = hasUrlInstance.isExistUrl(cid)

            with open(join(path, cid), "r") as fin:
                s2 = fin.read()

            if not s1 or not s2:
                # Some questions and comments are empty after preprocessing.
                bowScore = w2vScore = tmScore = emptyTextScore
            else:
                v1, v2 = BOW(s1, s2).getVector()
                bowScore = utility.cosine(v1, v2)
                w2vScore = utility.cosine(vec1, w2v.sentenceVector(s2))
                tmScore = utility.cosine(t1, tm.getProbability(cid))

            features = [bowScore, w2vScore, tmScore, sameUser]
            features.extend(categoryPro)
            features.append(tfidfScore)
            features.append(hasUrl)
            resultDict[cid] = features

    # Progress messages kept verbatim; the parenthesised form prints the same
    # text under the Python 2 print statement.
    print("bowDict, w2vDict, tmDict done!")
    print("resultDict done!")
    return resultDict
def main(originalFile, w2vFile, w2vDimension, topicModelFile, topicModelDimension, infoInstance, tfidfInstance, hasUrlInstance, ansProInstance):
    """Compute the feature list of every comment stored under ``originalFile``.

    Every sub-directory of ``originalFile`` represents a question.  Inside it,
    the file whose name equals the directory name contains the question text;
    all remaining files are comments, and each comment's file name serves as
    its id (cid).

    Args:
        originalFile: root directory with one sub-directory per question.
        w2vFile, w2vDimension: passed through to ``W2V``.
        topicModelFile, topicModelDimension: passed through to ``TopicModel``.
        infoInstance: object offering ``cidToCuserid``, ``cidToQuserid`` and
            ``qidToCategory``.
        tfidfInstance: object offering ``getTfidfScore(cid)``.
        hasUrlInstance: object offering ``isExistUrl(cid)``.
        ansProInstance: object offering ``getCategoryPro(category)``; the
            returned sequence is flattened into the feature list.

    Returns:
        dict mapping each cid to the list
        ``[bow, w2v, topic, sameUser, <category values...>, tfidf, url]``.
        The first three entries are question/comment cosine similarities;
        ``sameUser`` is 1.0 when comment and question share an author, else
        0.0.

    NOTE(review): ``main`` is defined more than once in this file and a later
    definition rebinds the name - confirm which version is meant to be live.
    """
    # Stand-in similarity for pairs where the question or the comment text is
    # empty after preprocessing.
    emptyTextScore = 0.000000000001

    utility = Utility()
    w2v = W2V(w2vFile, w2vDimension)
    tm = TopicModel(topicModelFile, topicModelDimension)

    resultDict = {}

    for qid in listdir(originalFile):
        # Build the path with join(), matching the isdir() check; plain
        # string concatenation only worked when originalFile ended with a
        # path separator.
        path = join(originalFile, qid)
        if not isdir(path):
            continue
        commentIds = [f for f in listdir(path)
                      if isfile(join(path, f)) and f != qid]

        # The question text lives in the file named like the directory.
        with open(join(path, qid), "r") as fin:
            s1 = fin.read()
        vec1 = w2v.sentenceVector(s1)
        t1 = tm.getProbability(qid)

        for cid in commentIds:
            cuserid = infoInstance.cidToCuserid(cid)
            quserid = infoInstance.cidToQuserid(cid)
            qcategory = infoInstance.qidToCategory(qid)
            sameUser = 1.0 if cuserid == quserid else 0.0

            # NOTE: the category values of the train set have to be recorded
            # once before getCategoryPro() can be used on train, dev and test.
            # To record them, temporarily use the following instead of the
            # getCategoryPro() call below:
            #
            #   categoryPro = infoInstance.getCategoryAnsPro(qcategory)
            #   cg = open("categoryAnsProTrain.txt", "a+")
            #   cg.write(qcategory + "\t")
            #   for i in range(len(categoryPro)):
            #       cg.write(str(categoryPro[i]) + "\t")
            #   cg.write("\n")
            categoryPro = ansProInstance.getCategoryPro(qcategory)
            tfidfScore = tfidfInstance.getTfidfScore(cid)
            hasUrl = hasUrlInstance.isExistUrl(cid)

            with open(join(path, cid), "r") as fin:
                s2 = fin.read()

            if s1 and s2:
                v1, v2 = BOW(s1, s2).getVector()
                bowScore = utility.cosine(v1, v2)
                w2vScore = utility.cosine(vec1, w2v.sentenceVector(s2))
                tmScore = utility.cosine(t1, tm.getProbability(cid))
            else:
                # Some questions and comments are empty after preprocessing.
                bowScore = w2vScore = tmScore = emptyTextScore

            features = [bowScore, w2vScore, tmScore, sameUser]
            features.extend(categoryPro)
            features.extend([tfidfScore, hasUrl])
            resultDict[cid] = features

    # Progress messages kept verbatim; the parenthesised form prints the same
    # text under the Python 2 print statement.
    print("bowDict, w2vDict, tmDict done!")
    print("resultDict done!")
    return resultDict
def main(originalFile, w2vFile, w2vDimension, topicModelFile, topicModelDimension, infoInstance, tfidfInstance, hasUrlInstance, ansProInstance, ynInstance):
    """Compute the feature list of every selected comment under ``originalFile``.

    Every sub-directory of ``originalFile`` represents a question.  Inside it,
    the file whose name equals the directory name contains the question text;
    all remaining files are comments, and each comment's file name serves as
    its id (cid).  A comment is processed only when its cid appears in
    ``ynInstance.getCidList()``.

    Args:
        originalFile: root directory with one sub-directory per question.
        w2vFile, w2vDimension: passed through to ``W2V``.
        topicModelFile, topicModelDimension: passed through to ``TopicModel``.
        infoInstance: object offering ``cidToCuserid``, ``cidToQuserid`` and
            ``qidToCategory``.
        tfidfInstance: object offering ``getTfidfScore(cid)``.
        hasUrlInstance: object offering ``isExistUrl(cid)``.
        ansProInstance: object offering ``getCategoryPro(category)``; the
            returned sequence is flattened into the feature list.
        ynInstance: object offering ``getCidList()``, the cids to keep.

    Returns:
        dict mapping each kept cid to the list
        ``[bow, w2v, topic, sameUser, <category values...>, tfidf, url]``.
        The first three entries are question/comment cosine similarities;
        ``sameUser`` is 1.0 when comment and question share an author, else
        0.0.

    NOTE(review): ``main`` is defined more than once in this file; earlier
    definitions are shadowed by later ones - confirm which is meant to be
    live.
    """
    # Stand-in similarity for pairs where the question or the comment text is
    # empty after preprocessing.
    emptyTextScore = 0.000000000001

    # A set gives the same membership test as the former dict of dummy zeros.
    wantedCids = set(ynInstance.getCidList())

    utility = Utility()
    w2v = W2V(w2vFile, w2vDimension)
    tm = TopicModel(topicModelFile, topicModelDimension)

    resultDict = {}

    for qid in listdir(originalFile):
        # Build the path with join(), matching the isdir() check; plain
        # string concatenation only worked when originalFile ended with a
        # path separator.
        path = join(originalFile, qid)
        if not isdir(path):
            continue
        fileList = [f for f in listdir(path) if isfile(join(path, f))]

        # The question text lives in the file named like the directory.
        with open(join(path, qid), "r") as fin:
            s1 = fin.read()
        vec1 = w2v.sentenceVector(s1)
        t1 = tm.getProbability(qid)

        for cid in fileList:
            # Ignore the question file itself and any comment that is not
            # listed.  The unlisted case used to end the loop, which silently
            # discarded the rest of the directory and made the result depend
            # on the arbitrary order listdir() returns.
            if cid == qid or cid not in wantedCids:
                continue

            cuserid = infoInstance.cidToCuserid(cid)
            quserid = infoInstance.cidToQuserid(cid)
            qcategory = infoInstance.qidToCategory(qid)
            sameUser = 1.0 if cuserid == quserid else 0.0

            categoryPro = ansProInstance.getCategoryPro(qcategory)
            tfidfScore = tfidfInstance.getTfidfScore(cid)
            hasUrl = hasUrlInstance.isExistUrl(cid)

            with open(join(path, cid), "r") as fin:
                s2 = fin.read()

            if s1 and s2:
                v1, v2 = BOW(s1, s2).getVector()
                bowScore = utility.cosine(v1, v2)
                w2vScore = utility.cosine(vec1, w2v.sentenceVector(s2))
                tmScore = utility.cosine(t1, tm.getProbability(cid))
            else:
                # Some questions and comments are empty after preprocessing.
                bowScore = w2vScore = tmScore = emptyTextScore

            features = [bowScore, w2vScore, tmScore, sameUser]
            features.extend(categoryPro)
            features.extend([tfidfScore, hasUrl])
            resultDict[cid] = features

    # Progress messages kept verbatim; the parenthesised form prints the same
    # text under the Python 2 print statement.
    print("bowDict, w2vDict, tmDict done!")
    print("resultDict done!")
    return resultDict