Example #1
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""

    trainPath = 'data/tagged-train.dat'
    trainingCorpus = Corpus(trainPath)

    devPath = 'data/tagged-dev.dat'
    devCorpus = Corpus(devPath)

    # print 'Unigram Language Model: '
    # unigramLM = UnigramModel(trainingCorpus)
    # unigramSpell = SpellCorrect(unigramLM, trainingCorpus)
    # unigramOutcome = unigramSpell.evaluate(devCorpus)
    # print str(unigramOutcome)

    # print 'Uniform Language Model: '
    # uniformLM = UniformModel(trainingCorpus)
    # uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    # uniformOutcome = uniformSpell.evaluate(devCorpus)
    # print str(uniformOutcome)

    # print 'Smooth Unigram Language Model: '
    # smoothUnigramLM = SmoothUnigramModel(trainingCorpus)
    # smoothUnigramSpell = SpellCorrect(smoothUnigramLM, trainingCorpus)
    # smoothUnigramOutcome = smoothUnigramSpell.evaluate(devCorpus)
    # print str(smoothUnigramOutcome)

    print 'Smooth Bigram Language Model: '
    smoothBigramLM = SmoothBigramModel(trainingCorpus)
    smoothBigramSpell = SpellCorrect(smoothBigramLM, trainingCorpus)
    smoothBigramOutcome = smoothBigramSpell.evaluate(devCorpus)
    print str(smoothBigramOutcome)
Example #2
def main(rootDirectory, words, confidenceLevel, functionType):
    corpus = Corpus(rootDirectory, toLowercase=TO_LOWERCASE, filters=FILTERS)
    #sampler = Sampler(SAMPLE_SIZE, sampleLength=SAMPLE_LENGTH)
    sampler = Sampler(SAMPLE_SIZE,
                      sampleLengthPercentage=SAMPLE_LENGTH_PERCENTAGE)

    documentSamples = {}
    for documentTitle in corpus.documents:
        documentSample = sampler.sample(corpus.documents[documentTitle],
                                        usePercentage=True)
        documentSamples[documentTitle] = documentSample

    wordCounter = WordCounter(words, SAMPLE_SIZE)
    wordCounter.countOccurrences(documentSamples)

    dataLabels = sorted(list(wordCounter.occurrences.keys()))
    dataSets = []
    for dataLabel in dataLabels:
        #dataSet = wordCounter.occurrences[dataLabel]
        dataSet = wordCounter.occurrencesPerMillionWords[dataLabel]
        dataSets.append(dataSet)

    statisticsPlotter = StatisticsPlotter(dataLabels, dataSets, CONFIDENCE,
                                          words)
    statisticsPlotter.plotStatistics(functionType=functionType)
Example #3
def searchengine(directory):
    stopWords = set(stopwords.words("english"))
    # stemming
    ps = PorterStemmer()

    # create InvertedIndex obj
    invertedIndex = InvertedIndex()
    # build the corpus 
    Corp = Corpus()
    corpus = Corp.buildCorpus(directory)
    for docId in corpus: 
        doc = corpus[docId] 
        content = doc.getContent()
        # tokenize 
        tokens = word_tokenize(content)
        
        for token in tokens:
            token = token.lower()
            # apply stemming 
            token = ps.stem(token)

            # remove stopwords 
            if token in stopWords:
                continue
            # add to index 
            invertedIndex.addTerm(token, docId)
        
    return invertedIndex, corpus
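The InvertedIndex class is not shown in this excerpt; a minimal sketch of what addTerm presumably maintains (a token-to-document-id posting map), offered purely as an assumption:

from collections import defaultdict

class InvertedIndex:
    """Hypothetical minimal version: maps each stemmed term to the set of docIds containing it."""
    def __init__(self):
        self.postings = defaultdict(set)

    def addTerm(self, term, docId):
        # Record that `term` occurs in document `docId`.
        self.postings[term].add(docId)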
Example #4
def test_reverse_markov_dict():
    full_markov_dict = MarkovDict(source=None, depth=1)
    corpus = Corpus(importStrFromFile("./test/corpora/reverse.txt"))
    full_markov_dict.add(corpus)
    bot = MarkovBot(full_markov_dict)
    print(bot.forward_dict.dict)
    print(bot.reverse_dict.dict)
Example #5
    def build_index(self):
        '''
        This function build the inverted index, it inserts the url to the
        doc Table with a doc_id, and insert each token to tokenT table
        and insert token, doc_id, term frequency and weight into the web_index
        Table
        '''

        c = Corpus()
        t = Tokenizer()

        for url, name in c.get_file_name():
            if len(url) > 1000:
                continue
            result = t.tokenize(name)
            if len(result) == 0:
                continue
            print(url)
            doc_id = 1

            #Insert URL to table DOC
            sql = "INSERT INTO web.doc(url) values (%s)"
            val = (url, )
            self.mycursor.execute(sql, val)
            self.mydb.commit()

            print(self.mycursor.rowcount, "was inserted in URL.")

            print(url)
            s_sql = "select id from doc where url=%s"
            self.mycursor.execute(s_sql, val)
            myresult = self.mycursor.fetchone()
            doc_id = myresult[0]
            print("DOC_ID IS " + str(doc_id))

            #Insert token, doc_id, tf into web_index
            t_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"

            t_val = []
            for token in result.keys():
                t_val.append(
                    (token, doc_id, result[token][0], result[token][1]))

            #print(t_val)

            self.mycursor.executemany(t_sql, t_val)

            self.mydb.commit()
            print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")

            #insert into TokenT table
            count = 0
            for token in result.keys():
                tq = "Insert ignore into tokenT values (%s)"
                tv = (token, )
                self.mycursor.execute(tq, tv)
                self.mydb.commit()
                count += 1

            print("inserted " + str(count) + " Tokens")
Example #6
def main():
  """Sanity checks the edit model on the word 'hi'."""

  trainPath = 'data/tagged-train.dat'
  trainingCorpus = Corpus(trainPath)
  editModel = EditModel("data/count_1edit.txt", trainingCorpus)
  #These are for testing, you can ignore them
  DELETE_EDITS = set(['Edit(editedWord=i, rule=<h|<)', 'Edit(editedWord=h, rule=hi|h)'])
  INSERT_EDITS = set([Edit('ahi','<','<a'),Edit('bhi','<','<b'),Edit('chi','<','<c'),Edit('dhi','<','<d'),Edit('ehi','<','<e'),Edit('fhi','<','<f'),Edit('ghi','<','<g'),Edit('hhi','<','<h'),Edit('ihi','<','<i'),Edit('jhi','<','<j'),Edit('khi','<','<k'),Edit('lhi','<','<l'),Edit('mhi','<','<m'),Edit('nhi','<','<n'),Edit('ohi','<','<o'),Edit('phi','<','<p'),Edit('qhi','<','<q'),
    Edit('rhi','<','<r'),Edit('shi','<','<s'),Edit('thi','<','<t'),Edit('uhi','<','<u'),Edit('vhi','<','<v'),Edit('whi','<','<w'),Edit('xhi','<','<x'),Edit('yhi','<','<y'),Edit('zhi','<','<z'),Edit('hai','h','ha'),Edit('hbi','h','hb'),Edit('hci','h','hc'),Edit('hdi','h','hd'),Edit('hei','h','he'),Edit('hfi','h','hf'),Edit('hgi','h','hg'),Edit('hhi','h','hh'),
    Edit('hii','h','hi'),Edit('hji','h','hj'),Edit('hki','h','hk'),Edit('hli','h','hl'),Edit('hmi','h','hm'),Edit('hni','h','hn'),Edit('hoi','h','ho'),Edit('hpi','h','hp'),Edit('hqi','h','hq'),Edit('hri','h','hr'),Edit('hsi','h','hs'),Edit('hti','h','ht'),Edit('hui','h','hu'),Edit('hvi','h','hv'),Edit('hwi','h','hw'),Edit('hxi','h','hx'),Edit('hyi','h','hy'),Edit('hzi','h','hz'),
    Edit('hia','i','ia'),Edit('hib','i','ib'),Edit('hic','i','ic'),Edit('hid','i','id'),Edit('hie','i','ie'),Edit('hif','i','if'),Edit('hig','i','ig'),Edit('hih','i','ih'),Edit('hii','i','ii'),Edit('hij','i','ij'),Edit('hik','i','ik'),Edit('hil','i','il'),Edit('him','i','im'),Edit('hin','i','in'),Edit('hio','i','io'),Edit('hip','i','ip'),Edit('hiq','i','iq'),Edit('hir','i','ir'),
    Edit('his','i','is'),Edit('hit','i','it'),Edit('hiu','i','iu'),Edit('hiv','i','iv'),Edit('hiw','i','iw'),Edit('hix','i','ix'),Edit('hiy','i','iy'),Edit('hiz','i','iz')])
  TRANPOSE_EDITS = set([Edit('ih','hi','ih')])
  REPLACE_EDITS = set([Edit('ai','h','a'),Edit('bi','h','b'),Edit('ci','h','c'),Edit('di','h','d'),Edit('ei','h','e'),Edit('fi','h','f'),Edit('gi','h','g'),Edit('ii','h','i'),Edit('ji','h','j'),
    Edit('ki','h','k'),Edit('li','h','l'),Edit('mi','h','m'),Edit('ni','h','n'),Edit('oi','h','o'),Edit('pi','h','p'),Edit('qi','h','q'),Edit('ri','h','r'),Edit('si','h','s'),Edit('ti','h','t'),
    Edit('ui','h','u'),Edit('vi','h','v'),Edit('wi','h','w'),Edit('xi','h','x'),Edit('yi','h','y'),Edit('zi','h','z'),Edit('ha','i','a'),Edit('hb','i','b'),Edit('hc','i','c'),Edit('hd','i','d'),Edit('he','i','e'),Edit('hf','i','f'),Edit('hg','i','g'),Edit('hh','i','h'),Edit('hj','i','j'),
    Edit('hk','i','k'),Edit('hl','i','l'),Edit('hm','i','m'),Edit('hn','i','n'),Edit('ho','i','o'),Edit('hp','i','p'),Edit('hq','i','q'),Edit('hr','i','r'),Edit('hs','i','s'),Edit('ht','i','t'),
    Edit('hu','i','u'),Edit('hv','i','v'),Edit('hw','i','w'),Edit('hx','i','x'),Edit('hy','i','y'),Edit('hz','i','z')])

  print "***Code Sanity Check***"
  print "Delete edits for 'hi'"
  checkOverlap(set(editModel.deleteEdits('hi')), DELETE_EDITS)
  print "Insert edits for 'hi'"
  checkOverlap(set(editModel.insertEdits('hi')), INSERT_EDITS)
  print "Transpose edits for 'hi'"
  checkOverlap(set(editModel.transposeEdits('hi')), TRANPOSE_EDITS)
  print "Replace edits for 'hi'"
  checkOverlap(set(editModel.replaceEdits('hi')), REPLACE_EDITS)
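checkOverlap() is not defined in this excerpt; a stand-in consistent with how it is called (comparing the generated edits with the expected set) might look like the sketch below — its behaviour is an assumption:

def checkOverlap(computed, expected):
    # Hypothetical helper: report how the computed edit set compares to the expected one.
    matched = computed & expected
    print("  matched %d of %d expected edits" % (len(matched), len(expected)))
    if expected - computed:
        print("  missing: %s" % (expected - computed))
    if computed - expected:
        print("  unexpected: %s" % (computed - expected))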
Example #7
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""

    # load training data
    trainPath = 'data/tagged-train.dat'
    trainingCorpus = Corpus(trainPath)

    # load dev data
    devPath = 'data/tagged-dev.dat'
    devCorpus = Corpus(devPath)

    print 'Unigram Language Model: '
    unigramLM = UnigramModel(trainingCorpus)
    unigramSpell = SpellCorrect(unigramLM, trainingCorpus)
    unigramOutcome = unigramSpell.evaluate(devCorpus)
    print str(unigramOutcome)

    print 'Uniform Language Model: '
    uniformLM = UniformModel(trainingCorpus)
    uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    uniformOutcome = uniformSpell.evaluate(devCorpus)
    print str(uniformOutcome)

    print 'Smooth Unigram Language Model: '
    smoothUnigramLM = SmoothUnigramModel(trainingCorpus)
    smoothUnigramSpell = SpellCorrect(smoothUnigramLM, trainingCorpus)
    smoothUnigramOutcome = smoothUnigramSpell.evaluate(devCorpus)
    print str(smoothUnigramOutcome)

    print 'Smooth Bigram Language Model: '
    smoothBigramLM = SmoothBigramModel(trainingCorpus)
    smoothBigramSpell = SpellCorrect(smoothBigramLM, trainingCorpus)
    smoothBigramOutcome = smoothBigramSpell.evaluate(devCorpus)
    print str(smoothBigramOutcome)

    print 'Backoff Language Model: '
    backoffLM = BackoffModel(trainingCorpus)
    backoffSpell = SpellCorrect(backoffLM, trainingCorpus)
    backoffOutcome = backoffSpell.evaluate(devCorpus)
    print str(backoffOutcome)

    print 'Custom Language Model: '
    customLM = CustomModel(trainingCorpus)
    customSpell = SpellCorrect(customLM, trainingCorpus)
    customOutcome = customSpell.evaluate(devCorpus)
    print str(customOutcome)
Example #8
def verificarPlagioTimeProfile(diretorioCorpus, diretorioDocumento, limiar):
    '''
    Input: path to a folder of files, path to a single file, and the containment threshold for the check.
    Times the instantiation of a corpus object and a document object, and the plagiarism check of the document against the corpus.
    '''
    c = Corpus(diretorioCorpus)
    doc = Documento(diretorioDocumento)
    return c.verificarPlagio(doc, limiar)
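The docstring promises timing, but no timing code appears in the body, so the function is presumably run under an external profiler. A stdlib-only sketch of timing the same call directly (the wrapper name is hypothetical):

import time

def verificarPlagioComTempo(diretorioCorpus, diretorioDocumento, limiar):
    # Hypothetical wrapper: time corpus/document construction plus the plagiarism check.
    inicio = time.time()
    resultado = verificarPlagioTimeProfile(diretorioCorpus, diretorioDocumento, limiar)
    print("elapsed: %.3f s" % (time.time() - inicio))
    return resultado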
Example #9
    def __init__(self, clustered_corpus):
        self.corpora = []
        for cluster in clustered_corpus:
            corpus = Corpus(cluster)
            self.corpora.append(corpus)
        if len(self.corpora) < 2:
            raise ValueError("clustered_corpus argument is not clustered")

        self.candidate_to_cu_mapping = self.calculate_cus_for()
Example #10
def create_corpus():
    corpus = Corpus()
    for folder in glob.iglob('texts/*'):
        for filename in glob.iglob(folder + "/*"):
            corpus.add_document(Document(filename))
            # corpus.add_document(Document(folder))

    corpus.build_vocabulary()
    return corpus
Example #11
def verificarPlagioMemUsageProfile(diretorioCorpus, diretorioDocumento,
                                   limiar):
    '''
    Input: path to a folder of files, path to a single file, and the containment threshold for the check.
    Measures the memory used to instantiate a corpus object and a document object, and to check the document for plagiarism against the corpus.
    '''
    c = Corpus(diretorioCorpus)
    doc = Documento(diretorioDocumento)
    return c.verificarPlagio(doc, limiar)
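As with the timing variant in Example #8, no measurement code appears in the body; it is presumably decorated or invoked by an external memory profiler. A stdlib-only sketch using tracemalloc (Python 3), offered as an assumed alternative:

import tracemalloc

def verificarPlagioComMemoria(diretorioCorpus, diretorioDocumento, limiar):
    # Hypothetical wrapper: report peak memory allocated while running the plagiarism check.
    tracemalloc.start()
    resultado = verificarPlagioMemUsageProfile(diretorioCorpus, diretorioDocumento, limiar)
    _, pico = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print("peak memory: %.1f MiB" % (pico / (1024.0 * 1024.0)))
    return resultado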
Example #12
 def __init__(self, verbose=False):
     print('Loading corpus ...')
     self.corpus = Corpus(verbose=verbose)
     self.corpus.create_data()
     self.X_seqs, self.y_seqs = self.corpus.X_seqs, self.corpus.y_seqs
     self.seq_count = len(self.X_seqs)
     for seq_idx in range(self.seq_count):
         assert (len(self.X_seqs[seq_idx]) == len(self.y_seqs[seq_idx]))
     self.feature_dim = len(self.X_seqs[0][0])
Example #13
def main():

    if len(sys.argv) != 3:
        print "Please provide paths to train and test corpora!"
    else:

        training_corpus = Corpus(sys.argv[1])
        test_corpus = Corpus(sys.argv[2])
        len_pos_train = len(training_corpus.generate_pos_pairs())
        len_neg_train = len(training_corpus.generate_neg_pairs())
        training_corpus.create_mallet_file("training_file_mallet.txt")

        len_test = len(test_corpus.generate_pos_pairs()) + len(
            test_corpus.generate_neg_pairs())
        test_corpus.create_test_file("test_file_mallet.txt")

        print "There are " + str(
            len_pos_train) + " positive training instances and " + str(
                len_neg_train) + " negative training instances."
        print "There are " + str(len_test) + " test instances."
Example #14
def test():
    c1 = Corpus("Big round boulder. That is a round snake.")
    c2 = Corpus("The dog is fat. The dog eats food. My dog is yellow. Your cat is yellow.")
    c3 = Corpus("Look out! Look behind you. Are you there? Are you okay? To you, I defer.")

    m1 = MarkovDict(c1)
    m2 = MarkovDict(c2, 2)
    m3 = MarkovDict(c3)

    print ("m1:", m1.response())
    print ("m1:", m1.response())
    print ("m1:", m1.response())

    print ("m2:", m2.response())
    print ("m2:", m2.response())
    print ("m2:", m2.response())

    print ("m3:", m3.response())
    print ("m3:", m3.response())
    print ("m3:", m3.response())
Example #15
 def __init__(self,
              text,
              keywords=None,
              remove_stopword=True,
              with_segs=False):
     self.text = text
     self.corpus = Corpus(text,
                          keywords=keywords,
                          remove_stopword=remove_stopword,
                          with_segs=with_segs)
     self.network = nx.Graph()
     self.build_network()
Example #16
def main():
    def get_data():
        client = pymongo.MongoClient()
        db = client.twitter4
        cursor = db.stream.aggregate([
            {'$match': {
                'date': {
                    '$gt': datetime.datetime(2015, 11, 13)
                }
            }},
            {'$sort': {'date': 1}},
            {'$project': {'text': 1, 'date': 1}},
        ])
        return cursor

    def get_remote_data():
        client = pymongo.MongoClient(host='59.77.134.176')
        db = client.twitter3
        cursor = db.stream.aggregate([
            # {'$sort': {'date': 1}},
            {'$project': {'text': 1}},
        ])
        return cursor

    cursor = get_data()
    print 'calculate_entropy: multiple occurrences of a word are counted only once'
    olda = None
    reallen = 0
    # for chunk_no, doc_chunk in enumerate(cursor_serial(cursor, 3000)):
    for chunk_no, doc_chunk in enumerate(chunkize_serial(cursor, 3000, as_numpy=False)):
        print doc_chunk[0]['date']
        doc_chunk = [tweet['text'] for tweet in doc_chunk]

        reallen += len(doc_chunk)

        print chunk_no, reallen - len(doc_chunk), reallen, len(doc_chunk), 'lda'
        start = datetime.datetime.now()
        if not olda:
            corpus = Corpus(doc_chunk)
            olda = OnlineLDA(corpus, K=10)
        else:
            olda.fit(doc_chunk)
        # Give them to online LDA

        print datetime.datetime.now() - start
        with codecs.open(r'G:\test18.out', "w", "utf-8-sig") as f:
            for topic_id, (topic_likelihood, topic_words, topic_tweets) in olda.get_lda_info():
                print '{}%\t{}'.format(round(topic_likelihood * 100, 2), topic_words)
                print '\t', topic_tweets
                f.write(topic_tweets + '\n')

        print '\n\n\n\n\n\n'
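chunkize_serial (presumably gensim.utils.chunkize_serial) groups an iterator into fixed-size lists, which is what lets the loop above feed the online LDA 3000 tweets at a time. A minimal equivalent, shown only to illustrate the chunking behaviour:

def chunk_iterator(iterable, chunksize):
    # Yield successive lists of up to `chunksize` items drawn from `iterable`.
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunksize:
            yield chunk
            chunk = []
    if chunk:
        yield chunk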
Example #17
 def __init__(self,
              text,
              keywords=None,
              remove_stopword=True,
              with_segs=False,
              weight_type='count'):
     self.text = text
     self.corpus = Corpus(text,
                          keywords=keywords,
                          remove_stopword=remove_stopword,
                          with_segs=with_segs)
     self.network = nx.Graph()
     self._network(weight_type)
Example #18
 def setup_corpus(self, theme, nb_docs):
     # holds the filtered words
     self.WORDS = theme + ";"
     # the three centrality metrics (one dictionary per word)
     self.DEGCEN = {}
     self.CLOCEN = {}
     self.BETCEN = {}
     # the corpus theme
     self.THEME = theme
     # number of documents in the corpus
     self.NB_DOCS = nb_docs
     # the corpus
     self.corpus = Corpus(theme)
     self.corpus.download_collection(nb_docs, keyword=theme)
     self.A = self.corpus.get_adjacency_matrix()
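The DEGCEN, CLOCEN and BETCEN dictionaries suggest that degree, closeness and betweenness centrality are computed later from the adjacency matrix. A sketch of how they could be filled in with networkx, assuming get_adjacency_matrix() returns a numpy array and a reasonably recent networkx (the helper name is hypothetical):

import networkx as nx

def compute_centralities(adjacency_matrix):
    # Build a graph from the adjacency matrix and return the three centrality dictionaries.
    graph = nx.from_numpy_array(adjacency_matrix)
    return (nx.degree_centrality(graph),
            nx.closeness_centrality(graph),
            nx.betweenness_centrality(graph))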
Example #19
 def train(self, numIterations=100, testCorpusPath=None):
     if testCorpusPath:
         testCorpus = Corpus(testCorpusPath)
     for i in range(1, numIterations + 1):
         self.algorithm.train()  # call train method from algorithm
         if i % 10 == 0:
             # trainEval = Evaluation(self.algorithm.corpus)
             # print "Training evaluation for", i, "iteration(s):\n", trainEval.format()
             # self.algorithm.corpus.resetSentStats()
             if testCorpusPath:
                 self.setPredictedTags(testCorpus)
                 testEval = Evaluation(testCorpus)
                 print "Testing evaluation for", i, "iteration(s):\n", testEval.format(
                 )
                 testCorpus.resetSentStats(
                 )  # !!! we can use prototype pattern(so we don't need to loop through sents): here testCorpus = testCorpus.getPrototype() and in Corpus::__init__ : self.prototype = self (google : python prototype)?
Example #20
File: main.py Project: jeremyjs/markov
def test():
    print("Creating new markov dict...")
    print(getDepth())
    corpus_path = "./corpora/test/depth.txt"
    corpus = Corpus(importStrFromFile(corpus_path))
    print(corpus)
    reverse_corpus = list(reversed(corpus))
    print(reverse_corpus)
    forward_markov_dict = MarkovDict(source=corpus, depth=getDepth())
    reverse_markov_dict = MarkovDict(source=reverse_corpus, depth=getDepth())
    pprint(forward_markov_dict.dict)
    pprint(reverse_markov_dict.dict)
    bot = MarkovBot(forward_markov_dict, reverse_markov_dict)
    # pprint(bot.forward_dict.dict)
    # pprint(bot.reverse_dict.dict)
    print(bot.response(topic="markov"))
Example #21
File: main.py Project: jeremyjs/markov
def main():
    print("Creating new markov dict...")
    forward_markov_dict = MarkovDict(source=None, depth=getDepth())
    reverse_markov_dict = MarkovDict(source=None, depth=getDepth())
    print("Starting for loop to add corpora...")
    for corpus_path in corporaPaths():
        corpus = Corpus(importStrFromFile(corpus_path))
        print("Adding corpus with path '" + corpus_path + "'...")
        forward_markov_dict.add(corpus)
        reverse_markov_dict.add(list(reversed(corpus)))
    print("Initializing MarkovBot...")
    bot = MarkovBot(forward_markov_dict, reverse_markov_dict)
    print("\nWelcome to MarkovBot! Type a message. Type 'exit()' to quit.")
    message = prompt()
    while message != "exit()":
        print(bot.response(topic=message.split()[0]))
        message = prompt()
Example #22
File: main.py Project: xuanvinhln/abc
def main():
    if (len(sys.argv) != 6):
        print "usage: python main.py <init_alpha> <modeldir_name> <num_topic> <data_file> <random/load>"
        sys.exit(1)

    init_alpha = float(sys.argv[1])
    directory = sys.argv[2]
    num_topics = int(sys.argv[3])
    data_file = sys.argv[4]
    start_type = sys.argv[5]

    # read_data
    corpus = Corpus()
    corpus.read_data(data_file)

    # Run LDA
    LdaEstimator.run_EM(init_alpha, directory, num_topics, corpus, start_type)
Example #23
def main():
    args = config()
    ns = args.ns  # value for ns
    part = args.part  # part or full
    ng_small = args.ng_small  # ngram_smallest_value
    ng_big = args.ng_big  # ngram_biggest_value
    use_subsample = args.subsample  # use_subsample or not

    corpus = Corpus(part, ng_small, ng_big, use_subsample)
    emb, _ = word2vec_trainer(corpus,
                              ns=ns,
                              dimension=64,
                              learning_rate=0.05,
                              iteration=50000)
    # Print similar words
    testwords = [
        "narrow-mindedness", "department", "campfires", "knowing", "urbanize",
        "imperfection", "principality", "abnormal", "secondary", "ungraceful"
    ]
    sim(testwords, corpus, emb)
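config() is not shown; judging from the attributes read off args above (ns, part, ng_small, ng_big, subsample), it presumably wraps argparse roughly as sketched below — the flag names, defaults and help strings are assumptions:

import argparse

def config():
    # Hypothetical reconstruction of the argument parser consumed by main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--ns", type=int, default=20, help="number of negative samples")
    parser.add_argument("--part", default="part", help="'part' or 'full' corpus")
    parser.add_argument("--ng_small", type=int, default=3, help="smallest n-gram size")
    parser.add_argument("--ng_big", type=int, default=6, help="largest n-gram size")
    parser.add_argument("--subsample", action="store_true", help="use subsampling")
    return parser.parse_args()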
Example #24
    def __init__(self, data_path, corpus_file):
        """
		WorbEmb class init

		Parameters
		----------
		data_path : str
			data full path
		corpus_file : str
			protein domain corpus file name

		Returns
		-------
		None
		"""
        self.data_path = data_path
        self.corpus_file = corpus_file
        self.Corpus = Corpus(self.data_path, self.corpus_file)
        self.w2v_model = "none"
        self.w2v_file_out = ""
Example #25
def downloadCorpus(snapshotDir, corpusDir, projectName, configInfo):

    # 2. Dump the snapshots for a project
    msg = '---------------------------------------------------- \n'
    msg += ' Dump the corpus for project %s \n' % projectName
    msg += '---------------------------------------------------- \n'
    print(msg)

    project_snapshot_dir = os.path.join(snapshotDir, projectName)
    project_corpus_dir = os.path.join(corpusDir, projectName)

    if os.path.isdir(project_corpus_dir):
        print "!! %s already exists...returning \n" % project_corpus_dir
        #return

    corpus = Corpus(project_snapshot_dir, 'java', project_corpus_dir,
                    configInfo)
    #logging.debug(corpus)
    #print corpus
    corpus.dump()
Example #26
def create_gannt_corpus_obj():
    corpus = Corpus()
    doc_id = 0

    source_file_directory = 'GANNT/high/'
    target_file_directory = 'GANNT/low/'

    # Gets a list of the source document file names and sort by ID number
    source_file_names = [
        f for f in listdir(source_file_directory)
        if isfile(join(source_file_directory, f))
    ]
    source_file_names.sort()

    # Iterates through the source document files and stores each document's contents and name in the document dictionary.
    # The dictionary's key is an id number that increments with each document.
    for fileName in source_file_names:
        # str.rstrip('.txt') strips a set of trailing characters, not the suffix, so drop the extension explicitly.
        doc_name = fileName[:-4] if fileName.endswith('.txt') else fileName
        temp_document = Document(
            doc_id, doc_name,
            open(source_file_directory + fileName, 'r').read().rstrip("\n"))
        corpus.add_source_document(temp_document)
        doc_id += 1

    # Gets a list of the target document file names and sort by ID number
    target_file_names = [
        f for f in listdir(target_file_directory)
        if isfile(join(target_file_directory, f))
    ]
    target_file_names.sort()

    # Iterates through the target document files and adds them to the corpus.
    for fileName in target_file_names:
        doc_name = fileName[:-4] if fileName.endswith('.txt') else fileName
        temp_document = Document(
            doc_id, doc_name,
            open(target_file_directory + fileName, 'r').read().rstrip("\n"))
        corpus.add_target_document(temp_document)
        doc_id += 1

    return corpus
Example #27
def main():
    clustered_corpus_path = 'clustered_corpus'
    clustered_corpus = read_clustered_corpus(clustered_corpus_path)
    corpus = merge_clustered_corpus_into_a_single_corpus(clustered_corpus)

    target_file_path = 'target.txt'
    text = read_text_file(target_file_path)
    document = Document(text)

    corpus = Corpus(corpus)
    clustered_corpus = ClusteredCorpus(clustered_corpus)

    candidate_to_rank_mapping = {}
    candidate_to_params_mapping = {}
    candidate_to_dfs_in_each_cluster_mapping = {}

    for candidate in document.get_candidates():
        tf = math.log(1.0 + document.get_tf_for(candidate), 10.0)
        # tf = document.get_tf_for(candidate)
        idf = math.log(1.0 + 1.0 / corpus.get_df_for(candidate), 2.0)
        cu = clustered_corpus.get_cu_for(candidate)

        rank = cu
        # rank = tf * cu
        # rank = tf * idf

        dfs_in_each_cluster = clustered_corpus.get_dfs_in_each_cluster_for(candidate)

        candidate_representative = corpus.get_representative_for(candidate)
        candidate_to_rank_mapping[candidate_representative] = rank
        candidate_to_params_mapping[candidate_representative] = (tf, idf, cu)
        candidate_to_dfs_in_each_cluster_mapping[candidate_representative] = dfs_in_each_cluster

    table = generate_table_based_on(
        candidate_to_rank_mapping,
        candidate_to_params_mapping,
        candidate_to_dfs_in_each_cluster_mapping
    )

    save_as_file(table)
    print('Done.')
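A tiny worked example of the tf and idf transforms used above, with made-up counts, just to make the scaling concrete:

import math

raw_tf = 4      # hypothetical: candidate occurs 4 times in the document
df = 0.125      # hypothetical document frequency returned by corpus.get_df_for()

tf = math.log(1.0 + raw_tf, 10.0)    # log10(5)  ~= 0.699, damped term frequency
idf = math.log(1.0 + 1.0 / df, 2.0)  # log2(9)   ~= 3.170, rarer candidates score higher
print("tf=%.3f idf=%.3f tf*idf=%.3f" % (tf, idf, tf * idf))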
Example #28
def main(args):
    if args[1] == 'repair':
        if args[2] == 'all':
            datasets = ['be5', 'dagger', 'milo', 'okhttp', 'picasso']
        else:
            datasets = args[2:]
        for dataset in tqdm(datasets):
            repair_real(dataset)
    if args[1] == 'gen_training_data':
        project_path = args[2]
        checkstyle_file_path = args[3]
        project_name = args[4]
        corpus_dir = create_corpus(
            project_path,
            project_name,
            checkstyle_file_path
        )
        corpus = Corpus(corpus_dir, project_name)
        share = { key:core_config['DATASHARE'].getint(key) for key in ['learning', 'validation', 'testing'] }
        synthetic.gen_dataset(corpus, share, target_dir=f'./styler/{project_name}-errors' )
        ml.gen_IO(f'./styler/{project_name}-errors', f'./styler/{project_name}-tokens', only_formatting=True)
        pass
Example #29
def create_icebreaker_corpus_obj():
    source_file = 'IceBreaker/Requirements.csv'
    target_file = 'IceBreaker/ClassDiagram.csv'
    corpus = Corpus()
    index = 0
    with open(source_file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        for name, desc in csv_reader:
            doc = Document(index, name, desc)
            corpus.add_source_document(doc)
        csv_file.close()

    with open(target_file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        for name, desc in csv_reader:
            doc = Document(index, name, desc)
            corpus.add_target_document(doc)
        csv_file.close()

    return corpus
Example #30
    def get_description(self):
        '''
        This function gets all the url, finds their description text
        and update them to the database
        '''
        #get doc_id
        self.mycursor.execute("select id,url from doc")
        myresult = self.mycursor.fetchall()
        for doc_id, url in myresult:
            #print("**********Doc ID is "+str(doc_id)+" ********")
            c = Corpus()
            name = c.url_to_dir(url)
            #print("Name is "+ name)
            with open(name, "rb") as file:
                content = file.read()
                soup = BeautifulSoup(content, "lxml")
                metas = soup.find_all("meta")
                result = ''
                for meta in metas:
                    if ('content' in meta.attrs) and ('name' in meta.attrs) and \
                       ((meta.attrs['name'] == 'description') or (meta.attrs['name'] == 'keywords')):
                        result = " ".join(meta.attrs['content'].split())

                #if html doesn't have description tag
                if result == '':
                    script = soup.find(
                        ["h1", "h2", "h3", "h4", "h5", "strong", "title", "b"])
                    if script:
                        temp = " ".join(script.text.split())
                        result += temp if len(temp) < 200 else ""
                print(result)
                i_sql = "update doc set description =%s where id = %s"
                i_val = (result, doc_id)
                self.mycursor.execute(i_sql, i_val)
                self.mydb.commit()
                print(self.mycursor.rowcount,
                      "was inserted in DOC , DOC ID IS " + str(doc_id))