コード例 #1
0
ファイル: main.py プロジェクト: jungin/SimilarQuestions
def main():
    """Run the TF-IDF experiment over the abridged Python questions file.

    Reads one JSON document per line from
    ``../../data/python_questions_abridged.txt``, builds an ``Index`` over
    the parsed questions, registers a default-constructed
    ``Vectorizer_TFIDF`` under the name it reports via ``getName()``, and
    hands both to ``ExperimentEngine`` before calling ``runExperiments()``.
    """
    # The context manager guarantees the handle is released even when a line
    # fails to parse.
    with open('../../data/python_questions_abridged.txt') as questionsFile:
        # One JSON document per line. Whitespace-only lines (typically a
        # trailing newline at end of file) are skipped instead of crashing
        # json.loads.
        questions = [json.loads(line) for line in questionsFile
                     if line.strip()]

    index = Index(questions)

    # Vectorizers are keyed by their self-reported name.
    tfidf = Vectorizer_TFIDF()
    vectorizers = {tfidf.getName(): tfidf}

    engine = ExperimentEngine(index, vectorizers)
    engine.runExperiments()
コード例 #2
0
ファイル: main.py プロジェクト: pkushiqiang/SimilarQuestions
def _select_dataset(argv):
    """Return ``(name, questions_path, linked_path)`` for the chosen dataset.

    A dataset is chosen by passing its name ('Python', 'English' or
    'Combined') as a command-line argument. When several are given, the one
    listed last in the table below wins. Returns ``None`` when no dataset
    name is present in *argv*.
    """
    datasets = (
        ('Python',
         '../../data/relevant_python_questions_with_body.txt',
         '../../data/full_python_linked.txt'),
        ('English',
         '../../data/relevant_english_questions_with_body.txt',
         '../../data/full_english_linked.txt'),
        ('Combined',
         '../../data/relevant_combined_questions_with_body.txt',
         '../../data/full_combined_linked.txt'),
    )
    chosen = None
    for entry in datasets:
        if entry[0] in argv:
            chosen = entry
    return chosen


def _trailing_count(argv, default):
    """Return the last command-line argument as an int, or *default*.

    The optional numeric parameter (n-gram size / number of synonyms) is
    read from the final argument; anything that is not all digits means
    "not supplied".
    """
    last = argv[-1]
    return int(last) if last.isdigit() else default


def _build_vectorizers(argv):
    """Instantiate the vectorizers requested in *argv*, keyed by name.

    Each flag ('tfidf', 'synonym-title', 'n-gram-body', ...) must appear as
    its own argument. If no vectorizer flag is present, the three TF-IDF
    variants are used as the default set.
    """
    # Always an int, so the vectorizers see the same type whether the value
    # came from the command line or from the default.
    count = _trailing_count(argv, 2)

    # Factories are lambdas so that only the requested vectorizers are
    # constructed. Order here is the order of insertion into the result.
    tfidf_variants = (
        ('tfidf', lambda: Vectorizer_TFIDF()),
        ('tfidf-title', lambda: Vectorizer_TFIDF('title')),
        ('tfidf-body', lambda: Vectorizer_TFIDF('body')),
    )
    other_variants = (
        ('synonym', lambda: Vectorizer_Synonym(count)),
        ('synonym-title', lambda: Vectorizer_Synonym(count, 'title')),
        ('synonym-body', lambda: Vectorizer_Synonym(count, 'body')),
        ('nounverb', lambda: Vectorizer_NV()),
        ('nounverb-title', lambda: Vectorizer_NV('title')),
        ('nounverb-body', lambda: Vectorizer_NV('body')),
        ('n-gram', lambda: Vectorizer_NGram(count)),
        ('n-gram-title', lambda: Vectorizer_NGram(count, 'title')),
        ('n-gram-body', lambda: Vectorizer_NGram(count, 'body')),
    )

    selected = [make for flag, make in tfidf_variants + other_variants
                if flag in argv]
    if not selected:
        # A dataset flag is always present, so "no arguments at all" can
        # never be the trigger for the default set; "no vectorizer flag" is.
        selected = [make for _, make in tfidf_variants]

    vectorizers = {}
    for make in selected:
        vectorizer = make()
        vectorizers[vectorizer.getName()] = vectorizer
    return vectorizers


def main():
    """Run similarity experiments for the dataset and vectorizers in sys.argv.

    Command-line arguments (order-independent unless noted):

    * one of ``Python`` / ``English`` / ``Combined`` -- required; selects the
      questions file and linked-questions file and sets
      ``Vectorizer.dataset``;
    * any of ``tfidf``, ``synonym``, ``nounverb``, ``n-gram``, each
      optionally suffixed with ``-title`` or ``-body`` -- the vectorizers to
      run; when none is given the three TF-IDF variants run;
    * an optional trailing integer (must be the last argument) -- the number
      of synonyms / n-gram size, defaulting to 2.

    Raises:
        SystemExit: if no dataset name is given.
    """
    argv = sys.argv

    dataset = _select_dataset(argv)
    if dataset is None:
        # Fail with a usable message instead of an UnboundLocalError on the
        # file name further down.
        raise SystemExit(
            "usage: main.py {Python|English|Combined} [vectorizer ...] [N]")
    name, questionsfilename, linkedfilename = dataset
    Vectorizer.dataset = name

    with open(questionsfilename) as questionsFile:
        # One JSON document per line; skip whitespace-only lines (e.g. a
        # trailing newline) rather than crash in json.loads.
        questions = [json.loads(line) for line in questionsFile
                     if line.strip()]

    # The index is built with n-grams only when some n-gram vectorizer was
    # requested (substring match also covers -title / -body). Uses the same
    # fallback of 2 as the vectorizers when no size is supplied.
    if any('n-gram' in arg for arg in argv):
        gram = _trailing_count(argv, 2)
    else:
        gram = 1
    index = Index(questions, ngram=gram)

    vectorizers = _build_vectorizers(argv)

    with open(linkedfilename) as linkedFile:
        linkedDocs = makeIntIndexes(json.load(linkedFile))

    engine = ExperimentEngine(index, linkedDocs, vectorizers)
    engine.runExperiments()