def precision_max_multiple_references(system, references, n):
    '''
    Given a word ww from the system output and reference sentences
    r1, r2, r3, r4, where ww occurs o1 times in r1, o2 in r2, o3 in r3,
    and o4 in r4: take the max of o1, o2, o3, o4, sum over all ww, and
    divide by the number of words in the system output.
    '''
    max_word_count = Counter()
    ng_sys = ngrams(system, n)
    for ww in ng_sys:
        counts = Counter()
        for rr in references:
            ng_refs = ngrams(rr.split(), n)
            if ww in ng_refs:
                counts[rr] = ng_refs[ww]
            else:
                counts[rr] = 0

        max_word_count[ww] = max(counts.values())

    return float(sum(max_word_count.values())) / float(len(system))
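A minimal usage sketch, assuming (as the function above does) that ngrams(tokens, n) returns a Counter mapping each n-gram to its count:

system_out = "the cat sat on the mat".split()
references = ["the cat is on the mat",
              "there is a cat on the mat"]
# clipped unigram precision: each system word is credited at most the
# maximum number of times it appears in any single reference
p1 = precision_max_multiple_references(system_out, references, 1)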
Example #2
def run_svm(mode='r',iterations=2):
    # setup work (generate all the ngrams if they don't exist yet)
    import os
    if not os.path.isfile('pos_%sgram.dump' % n):
        gen_all_ngrams()
    
    ind = Indexes(mode,iterations)
    acc = (0,0,0)
    # run svm
    for i in range(iterations):
        ind.next()
        (train, gramsdict) = training_set(ind,n=n)
        classifier = LinearSVMClassifier(data.Data(numpy.array(train, dtype=numpy.uint16).T))
        j = ind.get_pos_test_ind()[0]
        pos = os.listdir("pos")
        test = ngrams.grams_to_featurevector(gramsdict, ngrams.ngrams(n, open("pos/"+pos[j]).read()), label=None)
        print classifier.classify(test, dtype=numpy.uint16)
        neg = os.listdir("neg")
        j = ind.get_neg_test_ind()[0]
        test = ngrams.grams_to_featurevector(gramsdict, ngrams.ngrams(n, open("neg/"+neg[j]).read()), label=None)
        print classifier.classify(test, dtype=numpy.uint16)
        # NOTE: `m` and `p` are never assigned in this snippet; they appear to be
        # leftovers from the commented-out test_model() flow below.
        print m[-2]
        print m[-1]
        # p = test_model(m,ind,n=n)
        # nresults = len(p)
        # acc = [(a+b) for (a,b) in zip(acc,get_accuracy(p))]
    print acc
    
    return (m,p)
Example #3
def training_model(ind, n=3):
    print "Loading features"
    load_features(n, fmap)
    print "Feature map size: %s" % fmap.getSize()
    print "Getting training data"
    train = []
    for i in ind.get_pos_train_ind():
        item = os.listdir("pos")[i]
        train.append(
            (1, [(fmap.getID(item[0]), item[1])
                 for item in ngrams.ngrams(n,
                                           open("pos/" + item).read()).items()
                 if fmap.hasFeature(item[0])]))
    for i in ind.get_neg_train_ind():
        item = os.listdir("neg")[i]
        train.append((-1, [
            (fmap.getID(item[0]), item[1])
            for item in ngrams.ngrams(n,
                                      open("neg/" + item).read()).items()
            if fmap.hasFeature(item[0])
        ]))
    print "Training model"
    model = svmlight.learn(train, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    return model
Example #4
def keywords(passage):
    # List words and make all singular nouns
    word = []
    words = re.findall(r'\w+', passage)
    ini_tot_words = len(words)
    for w in words:
        if w=='000': w='THOUSAND' # Future work: generalize!
        if w !='' and len(w) >= 2: 
           if inflect.singular_noun(w) is False:
              word.append(w)   
              continue
           else:
              s = inflect.singular_noun(w)
              word.append(s)
              
    tot_words = len(word)
       
    # Count words and select the most repeated ones
    word_counts = Counter(word)
    key_word_1 = word_counts.most_common(20)        # The 20 most common single keywords
                                                    # (words shorter than 2 characters were excluded above)
    all_2key_words = Counter(ngrams(word, 2))
    key_words_2 = all_2key_words.most_common(20)    # The 20 most common bigrams (double keywords)
    all_3key_words = Counter(ngrams(word, 3))
    key_words_3 = all_3key_words.most_common(20)    # The 20 most common trigrams (triple keywords)
    return (ini_tot_words, tot_words, key_word_1, key_words_2, key_words_3)
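A short usage sketch, assuming `inflect` above refers to an instantiated engine (e.g. inflect.engine()) and that ngrams(tokens, n) accepts a list of tokens:

passage = ("Natural language processing systems count words and word pairs "
           "to find the most frequent keywords in a passage of text.")
ini_total, total, top_words, top_bigrams, top_trigrams = keywords(passage)
print(top_words[:5])     # five most frequent single words with their counts
print(top_bigrams[:5])   # five most frequent bigrams with their counts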
                                           
Example #5
def test_model(model,ind,n=3):
    test = []
    for i in ind.get_pos_train_ind():
        item = os.listdir("pos")[i]
        test.append((1,[(fmap.getID(item[0]),item[1]) for item in ngrams.ngrams(n, open("pos/"+item).read()).items() if fmap.hasFeature(item[0])]))
    for i in ind.get_neg_test_ind():
        item = os.listdir("neg")[i]
        test.append((-1,[(fmap.getID(item[0]),item[1]) for item in ngrams.ngrams(n, open("neg/"+item).read()).items() if fmap.hasFeature(item[0])]))
    predictions = svmlight.classify(model, test)
    return predictions
Example #6
    def test(self):
        if self.test_set:
            for s in range(1, 6):
                self.test_dir = select_extradata(self.test_set, s)
                print "Testing with %s" % self.test_dir
                test_files = os.listdir(self.test_dir)
                ntest = len(test_files)
                tests = [{} for i in range(ntest)]
                for i in range(ntest):
                    for j in self.n:
                        tests[i].update(
                            ngrams.ngrams(
                                j,
                                open("%s/%s" %
                                     (self.test_dir, test_files[i])).read(),
                                self.negation))
                results = [
                    self.classifier.classify(i, binary=self.binary)
                    for i in tests
                ]
                correct = len([i for i in results if int(i) == 1])
                print "%s Stars, Positive: %s of %s, %s accuracy" % (
                    s, correct, len(tests), (float(correct) / len(tests)))
            return (0, 0)  # return dummy values when testing on external data

        pos_tests = [{} for f in self.pos_test_data]
        neg_tests = [{} for f in self.neg_test_data]

        # Testset --> Feature Vectors
        for j in self.n:
            for i in range(len(self.pos_test_data)):
                pos_tests[i].update(
                    ngrams.ngrams(j, self.pos_test_data[i], self.negation))
            for i in range(len(self.neg_test_data)):
                neg_tests[i].update(
                    ngrams.ngrams(j, self.neg_test_data[i], self.negation))

        # Testing
        pos_results = [
            self.classifier.classify(i, binary=self.binary) for i in pos_tests
        ]
        pos_correct = len([i for i in pos_results if int(i) == 1])
        print "Positive: %s of %s, %s accuracy" % (pos_correct, len(pos_tests),
                                                   (float(pos_correct) /
                                                    len(pos_tests)))
        neg_results = [
            self.classifier.classify(i, binary=self.binary) for i in neg_tests
        ]
        neg_correct = len([i for i in neg_results if int(i) == -1])
        print "Negative: %s of %s, %s accuracy" % (neg_correct, len(neg_tests),
                                                   (float(neg_correct) /
                                                    len(neg_tests)))
        return (float(pos_correct) / len(pos_tests),
                float(neg_correct) / len(neg_tests))
Example #7
def precision(system, reference, n):
    counts = Counter()
    p = 0.0
    ng_sys = ngrams(system, n)
    for ss in ng_sys:
        ng_refs = ngrams(reference, n)
        if ss in ng_refs:
            counts[ss] += ng_refs[ss]

    for ng in counts:
        p += float(counts[ng]) / float(len(system))
    return p
Example #8
def training_set(ind,n=3):
    """
    
    Caution: Do not use 0 as label because it evaluates to False
    """
    pos = os.listdir("pos")
    feature_vectors = [ngrams.ngrams(n, open("pos/"+pos[i]).read()) for i in ind.get_pos_train_ind()]
    labels = [1 for i in ind.get_pos_train_ind()]
    neg = os.listdir("neg")
    feature_vectors.extend([ngrams.ngrams(n, open("neg/"+neg[i]).read()) for i in ind.get_neg_train_ind()])
    labels.extend([2 for i in ind.get_neg_train_ind()])
    (matrix, gramsdict) = ngrams.ngrams_to_matrix(feature_vectors, labels, return_gramsdict=True)
    return (matrix.asMatrix(), gramsdict)
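The caution in the docstring can be illustrated in a couple of lines: a label of 0 is falsy, so any code that tests a label for truthiness silently treats it as "no label", which is presumably why 1 and 2 are used as class labels here.

label = 0
if not label:
    # reached even though a label was assigned
    pass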
def precision_multiple_references(system, references, n):
    counts = Counter()
    p = 0.0
    ng_sys = ngrams(system, n)
    for ss in ng_sys:
        for rr in references:
            ng_refs = ngrams(rr.split(), n)
            if ss in ng_refs:
                counts[ss] += ng_refs[ss]

    print counts
    for ng in counts:
        p += float(counts[ng]) / float(len(system))
    return p
Example #10
    def perplexity(self, corpus):
        """
        Compute per-token perplexity. The cross-entropy of p with respect
        to q is defined as

        H(p, q) = -\sum_x p(x) \log_2 q(x)

        In the case of language modeling, q is the estimated probability
        distribution and p is the (MLE) probability of each n-gram in the
        held-out data. The perplexity of p w.r.t. q is then simply

        PPX(p, q) = 2^{H / Z}

        where H accumulates -\log_2 q(x) over the held-out data and Z is
        the number of tokens in p.
        """
        # frequency of observations over corpus
        fx = defaultdict(int)
        # number of observations in corpus
        Z = 0
        for (prefix, suffix) in ngrams(corpus, self.order):
            Z += 1
            fx[(prefix, suffix)] += 1
        # collect entropy
        H = BitWeight(1.)
        for ((prefix, suffix), f_x) in fx.iteritems():
            H *= f_x * self.prob[prefix][suffix]
        # divide to get per-token entropy, exponentiate to get perplexity
        return 2 ** (H.bw / Z)    
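A standalone sketch of the same computation using plain floats instead of BitWeight (assuming BitWeight accumulates -log2 probabilities); the toy model probabilities below are made up for illustration.

import math

# held-out data as (prefix, suffix) n-gram pairs
held_out = [(("the",), "cat"), (("the",), "cat"), (("the",), "dog")]
# hypothetical estimated distribution q(suffix | prefix)
q = {("the",): {"cat": 0.5, "dog": 0.25}}

H = 0.0   # accumulated -log2 q(x) over the corpus
Z = 0     # number of tokens
for (prefix, suffix) in held_out:
    H -= math.log(q[prefix][suffix], 2)
    Z += 1
perplexity = 2 ** (H / Z)   # 2 ** (4 / 3.0), roughly 2.52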
Example #11
    def _compute_prob(self, corpus):
        """
        Compute a conditional frequency distribution and normalize it into
        BitWeight probabilities.
        """
        # conditional frequency distribution representation
        self.freq = defaultdict(lambda: defaultdict(int))
        # populate it
        for (prefix, suffix) in ngrams(corpus, self.order):
            self.freq[prefix][suffix] += 1
        # initialize log2 probability distribution with default value (0)
        self.prob = defaultdict(lambda: defaultdict(lambda: BitWeight()))
        # populate seen probabilities
        for (prefix, suffixes) in self.freq.iteritems():
            # prefix is a tuple of tokens, suffixes is a dictionary
            # containing token: count key/value pairs
            denominator = BitWeight(sum(suffixes.values()))
            # optimize dictionary lookup; both of these are pointers, so
            # modifying pdist (as we will) modifies self.prob too
            fdist = self.freq[prefix]
            pdist = self.prob[prefix]
            # normalize, using fdist[suffix], the full n-gram's count
            for suffix in suffixes:
                # BitWeight class implements division as log-space
                # subtraction so as to forestall underflow
                pdist[suffix] = BitWeight(fdist[suffix]) / denominator
Example #12
def get_ngram_overlap(translation, references):
    trans_ngrams = ngrams(translation)
    all_references = [[] for ignored in range(len(trans_ngrams))]
    for reference in references:
        ref_ngrams = ngrams(reference)
        i = 0
        while i < len(ref_ngrams):
            all_references[i].append(ref_ngrams[i])
            i += 1
    product = 1
    i = 0
    while i < len(trans_ngrams):
        matched = match_ngrams(trans_ngrams[i], all_references[i])
        product = product * (len(matched) / len(trans_ngrams[i]))
        i += 1
    return product**0.25
Example #13
def analyse_cable(cable, conn):
    global NGRAM_INSERT, LOCATION_INSERT
    idx     = cable["id"]
    # Clean the data
    content = cable.get("content")
    content = cleaner.slugify(content)
    content = cleaner.stopwords(content)
    # get the ngrams
    records   = ngrams(content, n_max=3)
    cities    = geo.get_cities(content)
    countries = geo.get_countries(content)
    cur  = conn.cursor(cursor_factory=DictCursor)
    # Record ngrams, start transaction
    ngram_rows  = []
    for token, count in records.iteritems():
        row = (idx, token, count, cable['date'])
        ngram_rows.append(row)
    # Record geo location
    location_rows  = []
    # Collect city to insert
    for city in cities:
        row = (idx, city["name"].upper(), "CITY", city["countrycode"], city["latitude"], city["longitude"], cable['date'])
        location_rows.append(row)
    # Collect countries to insert
    for country in countries:
        row = (idx, country["name"].upper(), "COUNTRY", country["countrycode"], None, None, cable['date'])
        location_rows.append(row)
    try:
        cur.executemany(NGRAM_INSERT, ngram_rows)
        cur.executemany(LOCATION_INSERT, location_rows)
        conn.commit()
    except psycopg2.IntegrityError:
        # Ignore the integrity error
        conn.rollback()
    return cable
Example #14
def training_model(ind,n=3):
    print "Loading features"
    load_features(n,fmap)
    print "Feature map size: %s" % fmap.getSize()
    print "Getting training data"
    train = []
    for i in ind.get_pos_train_ind():
        item = os.listdir("pos")[i]
        train.append((1,[(fmap.getID(item[0]),item[1]) for item in ngrams.ngrams(n, open("pos/"+item).read()).items() if fmap.hasFeature(item[0])]))
    for i in ind.get_neg_train_ind():
        item = os.listdir("neg")[i]
        train.append((-1,[(fmap.getID(item[0]),item[1]) for item in ngrams.ngrams(n, open("neg/"+item).read()).items() if fmap.hasFeature(item[0])]))
    print "Training model"
    model = svmlight.learn(train, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    return model
Example #15
def gen_ngrams(n=2, data="pos"):
    "Generate ngrams and save locally"
    temp = []
    for i in os.listdir("%s" % data):
        temp.append(open("%s/" % data + i).read())
    temp = "\n".join(temp)
    aggregate_ngrams = ngrams.ngrams(n, temp)
    pickle.dump(aggregate_ngrams, open("%s_%sgram.dump" % (data, n), 'w'))
Example #16
def gen_ngrams(n=2,data="pos"):
    "Generate ngrams and save locally"
    temp = []
    for i in os.listdir("%s" % data):
        temp.append(open("%s/" % data + i).read())
    temp = "\n".join(temp)
    aggregate_ngrams = ngrams.ngrams(n, temp)
    pickle.dump(aggregate_ngrams, open("%s_%sgram.dump" % (data,n),'w'))
Example #17
    def train(self):
        pos_train = [{} for f in self.pos_train_data]
        neg_train = [{} for f in self.neg_train_data]

        # Reading files
        for (j, lim) in zip(self.n, self.limit):
            all_grams = [
                ngrams.ngrams(j, f, self.negation) for f in self.pos_train_data
            ]
            for i in range(len(self.pos_train_data)):
                pos_train[i].update(all_grams[i])
            featureslist = all_grams

            all_grams = [
                ngrams.ngrams(j, f, self.negation) for f in self.neg_train_data
            ]
            for i in range(len(self.neg_train_data)):
                neg_train[i].update(all_grams[i])
            featureslist.extend(all_grams)

            # Collapsing, limiting ngrams
            self.features.update(
                ngrams.top_ngrams(ngrams.collapse_ngrams(featureslist), lim))

        # Creating Index
        self.classifier = self.clsf(restrictFeatures=self.features)
        print "# features: %s" % self.classifier.nfeatures

        if self.idf:
            print "Using TF-IDF"
            idf = ngrams.ngrams_to_idf(pos_train + neg_train)
            for i in range(len(pos_train)):
                for j in pos_train[i]:
                    pos_train[i][j] = pos_train[i][j] * idf[j]
            for i in range(len(neg_train)):
                for j in neg_train[i]:
                    neg_train[i][j] = neg_train[i][j] * idf[j]

        # Making classifier
        for i in pos_train:
            self.count += 1
            self.classifier.addFeatureVector(i, 1, binary=self.binary)
        for i in neg_train:
            self.classifier.addFeatureVector(i, -1, binary=self.binary)
        self.classifier.compile()
Example #18
    def test(self):
        pos_test_votes = False
        neg_test_votes = False
        for t in self.testers:
            pos_tests = [{} for f in t.pos_test_data]
            neg_tests = [{} for f in t.neg_test_data]
            for j in t.n:
                for i in range(len(t.pos_test_data)):
                    pos_tests[i].update(
                        ngrams.ngrams(j, t.pos_test_data[i], self.negation))
                for i in range(len(t.neg_test_data)):
                    neg_tests[i].update(
                        ngrams.ngrams(j, t.neg_test_data[i], self.negation))
            pos_results = [t.classifier.classify(i) for i in pos_tests]
            neg_results = [t.classifier.classify(i) for i in neg_tests]
            if not pos_test_votes:
                pos_test_votes = pos_results
            else:
                for i in range(len(pos_test_votes)):
                    pos_test_votes[i] += pos_results[i]
            if not neg_test_votes:
                neg_test_votes = neg_results
            else:
                for i in range(len(neg_test_votes)):
                    neg_test_votes[i] += neg_results[i]
        pos_correct = 0
        neg_correct = 0
        for i in pos_test_votes:
            if i > 0:
                pos_correct += 1
        for i in neg_test_votes:
            if i < 0:
                neg_correct += 1

        print "Positive: %s of %s, %s accuracy" % (
            pos_correct, len(pos_test_votes),
            (float(pos_correct) / len(pos_test_votes)))

        print "Negative: %s of %s, %s accuracy" % (
            neg_correct, len(neg_test_votes),
            (float(neg_correct) / len(neg_test_votes)))
        return (float(pos_correct) / len(pos_test_votes),
                float(neg_correct) / len(neg_test_votes))
Example #19
    def substitution_score_bind(individual):
        decoded_msg = substitution_cipher(msg, [dict(zip(substitution, alphabet)) for substitution in individual])
        decoded_msg = ngrams(decoded_msg, ngram_size)
        ngrams_frequency = calculate_frequency_norm(decoded_msg)

        score = 0
        for ngram in ngrams_frequency:
            score += eng_frequency.get(ngram, 0) * ngrams_frequency[ngram]

        return score
def all_ngrams_multiple_references(n, references):
    '''
    count total ngrams
    '''
    ngram_count = 0
    for rr in references:
        ng_refs = ngrams(rr.split(), n)
        ngram_count += len(ng_refs)

    return ngram_count
Example #21
def test_model(model, ind, n=3):
    test = []
    for i in ind.get_pos_train_ind():
        item = os.listdir("pos")[i]
        test.append(
            (1, [(fmap.getID(item[0]), item[1])
                 for item in ngrams.ngrams(n,
                                           open("pos/" + item).read()).items()
                 if fmap.hasFeature(item[0])]))
    for i in ind.get_neg_test_ind():
        item = os.listdir("neg")[i]
        test.append((-1, [
            (fmap.getID(item[0]), item[1])
            for item in ngrams.ngrams(n,
                                      open("neg/" + item).read()).items()
            if fmap.hasFeature(item[0])
        ]))
    predictions = svmlight.classify(model, test)
    return predictions
Example #22
    def train(self):
        pos_train = [{} for f in self.pos_train_data]
        neg_train = [{} for f in self.neg_train_data]
        
        # Reading files
        for (j,lim) in zip(self.n,self.limit):
            all_grams = [ngrams.ngrams(j, f, self.negation) for f in self.pos_train_data]
            for i in range(len(self.pos_train_data)):
                pos_train[i].update(all_grams[i])
            featureslist = all_grams

            all_grams = [ngrams.ngrams(j, f, self.negation) for f in self.neg_train_data]
            for i in range(len(self.neg_train_data)):
                neg_train[i].update(all_grams[i])
            featureslist.extend(all_grams)

            # Collapsing, limiting ngrams
            self.features.update(ngrams.top_ngrams(ngrams.collapse_ngrams(
                        featureslist),lim))

        # Creating Index
        self.classifier = self.clsf(restrictFeatures = self.features)
        print "# features: %s" % self.classifier.nfeatures
        
        if self.idf:
            print "Using TF-IDF"
            idf = ngrams.ngrams_to_idf(pos_train + neg_train)
            for i in range(len(pos_train)):
                for j in pos_train[i]:
                    pos_train[i][j] = pos_train[i][j] * idf[j]
            for i in range(len(neg_train)):
                for j in neg_train[i]:
                    neg_train[i][j] = neg_train[i][j] * idf[j]
                            
        # Making classifier
        for i in pos_train:
            self.count += 1
            self.classifier.addFeatureVector(i, 1, binary=self.binary)
        for i in neg_train:
            self.classifier.addFeatureVector(i, -1, binary=self.binary)
        self.classifier.compile()
Example #23
    def substitution_score_bind(substitution):
        decoded_msg = substitution_cipher(
            msg, [dict(zip(substitution['alphabet'], alphabet))])
        decoded_msg = ngrams(decoded_msg, ngram_size)
        ngrams_frequency = calculate_frequency_norm(decoded_msg)

        for ngram in ngrams_frequency:
            if substitution['score'] is None:
                substitution['score'] = 0
            substitution['score'] += eng_frequency.get(
                ngram, 0) * ngrams_frequency[ngram]

        return substitution
Example #24
def run_svm(mode='r', iterations=2):
    # setup work (generate all the ngrams if they don't exist yet)
    import os
    if not os.path.isfile('pos_%sgram.dump' % n):
        gen_all_ngrams()

    ind = Indexes(mode, iterations)
    acc = (0, 0, 0)
    # run svm
    for i in range(iterations):
        ind.next()
        (train, gramsdict) = training_set(ind, n=n)
        classifier = LinearSVMClassifier(
            data.Data(numpy.array(train, dtype=numpy.uint16).T))
        j = ind.get_pos_test_ind()[0]
        pos = os.listdir("pos")
        test = ngrams.grams_to_featurevector(gramsdict,
                                             ngrams.ngrams(
                                                 n,
                                                 open("pos/" + pos[j]).read()),
                                             label=None)
        print classifier.classify(test, dtype=numpy.uint16)
        neg = os.listdir("neg")
        j = ind.get_neg_test_ind()[0]
        test = ngrams.grams_to_featurevector(gramsdict,
                                             ngrams.ngrams(
                                                 n,
                                                 open("neg/" + neg[j]).read()),
                                             label=None)
        print classifier.classify(test, dtype=numpy.uint16)
        # NOTE: `m` and `p` are never assigned in this snippet; they appear to be
        # leftovers from the commented-out test_model() flow below.
        print m[-2]
        print m[-1]
        # p = test_model(m,ind,n=n)
        # nresults = len(p)
        # acc = [(a+b) for (a,b) in zip(acc,get_accuracy(p))]
    print acc

    return (m, p)
Example #25
    def test(self):
        if self.test_set:
            for s in range(1,6):
                self.test_dir = select_extradata(self.test_set,s)
                print "Testing with %s" % self.test_dir
                test_files = os.listdir(self.test_dir)
                ntest = len(test_files)
                tests = [{} for i in range(ntest)]
                for i in range(ntest):
                    for j in self.n:
                        tests[i].update(ngrams.ngrams(j, open("%s/%s" % (
                            self.test_dir,test_files[i])).read(), self.negation))
                results = [self.classifier.classify(i,binary=self.binary) for i in tests]
                correct = len([i for i in results if int(i) == 1])
                print "%s Stars, Positive: %s of %s, %s accuracy" % (s,correct,len(tests),
                        (float(correct)/len(tests)))
            return (0,0) # return dummy values when testing on external data

        pos_tests = [{} for f in self.pos_test_data]
        neg_tests = [{} for f in self.neg_test_data]

        # Testset --> Feature Vectors
        for j in self.n:
            for i in range(len(self.pos_test_data)):
                pos_tests[i].update(ngrams.ngrams(j, self.pos_test_data[i], self.negation))
            for i in range(len(self.neg_test_data)):
                neg_tests[i].update(ngrams.ngrams(j, self.neg_test_data[i], self.negation))

        # Testing
        pos_results = [self.classifier.classify(i,binary=self.binary) for i in pos_tests]
        pos_correct = len([i for i in pos_results if int(i) == 1])
        print "Positive: %s of %s, %s accuracy" % (pos_correct,len(pos_tests),
                (float(pos_correct)/len(pos_tests)))
        neg_results = [self.classifier.classify(i,binary=self.binary) for i in neg_tests]
        neg_correct = len([i for i in neg_results if int(i) == -1])
        print "Negative: %s of %s, %s accuracy" % (neg_correct,len(neg_tests),
                (float(neg_correct)/len(neg_tests)))
        return (float(pos_correct)/len(pos_tests), float(neg_correct)/len(neg_tests))
Example #26
    def test(self):
        pos_test_votes = False
        neg_test_votes = False
        for t in self.testers:
            pos_tests = [{} for f in t.pos_test_data]
            neg_tests = [{} for f in t.neg_test_data]
            for j in t.n:
                for i in range(len(t.pos_test_data)):
                    pos_tests[i].update(ngrams.ngrams(j, t.pos_test_data[i], self.negation))
                for i in range(len(t.neg_test_data)):
                    neg_tests[i].update(ngrams.ngrams(j, t.neg_test_data[i], self.negation))
            pos_results = [t.classifier.classify(i) for i in pos_tests]
            neg_results = [t.classifier.classify(i) for i in neg_tests]
            if not pos_test_votes:
                pos_test_votes = pos_results
            else:
                for i in range(len(pos_test_votes)):
                    pos_test_votes[i] += pos_results[i]
            if not neg_test_votes:
                neg_test_votes = neg_results
            else:
                for i in range(len(neg_test_votes)):
                    neg_test_votes[i] += neg_results[i]
        pos_correct = 0
        neg_correct = 0
        for i in pos_test_votes:
            if i > 0:
                pos_correct += 1
        for i in neg_test_votes:
            if i < 0:
                neg_correct += 1

        print "Positive: %s of %s, %s accuracy" % (pos_correct,len(pos_test_votes),
                (float(pos_correct)/len(pos_test_votes)))

        print "Negative: %s of %s, %s accuracy" % (neg_correct,len(neg_test_votes),
                (float(neg_correct)/len(neg_test_votes)))
        return (float(pos_correct)/len(pos_test_votes), float(neg_correct)/len(neg_test_votes))
Example #27
def training_set(ind, n=3):
    """
    
    Caution: Do not use 0 as label because it evaluates to False
    """
    pos = os.listdir("pos")
    feature_vectors = [
        ngrams.ngrams(n,
                      open("pos/" + pos[i]).read())
        for i in ind.get_pos_train_ind()
    ]
    labels = [1 for i in ind.get_pos_train_ind()]
    neg = os.listdir("neg")
    feature_vectors.extend([
        ngrams.ngrams(n,
                      open("neg/" + neg[i]).read())
        for i in ind.get_neg_train_ind()
    ])
    labels.extend([2 for i in ind.get_neg_train_ind()])
    (matrix, gramsdict) = ngrams.ngrams_to_matrix(feature_vectors,
                                                  labels,
                                                  return_gramsdict=True)
    return (matrix.asMatrix(), gramsdict)
def counts_max_multiple_references(system, references, n):
    '''
    Given a word ww from the system output and reference sentences
    r1, r2, r3, r4, where ww occurs o1 times in r1, o2 in r2, o3 in r3,
    and o4 in r4: take the max of o1, o2, o3, o4 and sum over all ww.
    '''
    max_word_count = Counter()
    ng_sys = ngrams(system, n)
    for ww in ng_sys:
        counts = Counter()
        for rr in references:
            ng_refs = ngrams(rr.split(), n)
            if ww in ng_refs:
                counts[rr] = ng_refs[ww]
            else:
                counts[rr] = 0

        max_word_count[ww] = max(counts.values())

    return sum(max_word_count.values())
def addbigrams(dft, dfte, df1, selector=0, n=50):
    top = topwords(df1, 'Clean tweet', n)
    bigrams = ngrams(df1, 'Clean tweet')
    bigramsw = bigrams.bigrams
    main_domain = join(dft, 'Clean tweet')
    main_domain1 = join(dfte, 'Clean tweet')
    main_domain.joinall(bigramsw, 2)
    main_domain1.joinall(bigramsw, 2)
    return main_domain.df, main_domain1.df
        def getmostcommon(df, df1, n=10):
            main_domain = join(df, 'Clean tweet')
            main_domain1 = join(df1, 'Clean tweet')
            top = topwords(self.df2, 'Clean tweet', n)
            bigrams = ngrams(self.df2, 'Clean tweet', n)
            topw = top.top
            bigramsw = bigrams.bigrams
            main_domain.joinall(topw, 1)
            main_domain.joinall(bigramsw, 2)
            main_domain1.joinall(topw, 1)
            main_domain1.joinall(bigramsw, 2)
            return main_domain.df, main_domain1.df
Example #31
        def getintersection(df, selector=0, n=50):
            main_domain = join(df, 'Clean tweet')
            top = topwords(df, 'Clean tweet', n)
            bigrams = ngrams(df, 'Clean tweet')
            topw = top.top
            bigramsw = bigrams.bigrams
            main_domain.joinall(topw, 1)
            mutualwordsu = mutualinfo(main_domain.df)
            main_domain.joinall(bigramsw, 2)
            mutualwordsb = mutualinfo(main_domain.df)
            mutualwordsb = [e for e in mutualwordsb if e not in mutualwordsu]
            ratiov = ratio(main_domain.df, 'L')
            ratios = ratiov.getoddratios(top.top)
            dratios = list(ratios.keys())
            return topw, bigramsw, dratios, mutualwordsu, mutualwordsb
Example #32
def sentencer(inpfile):
    in1 = open(inpfile,"r")
    content = str(in1.read())
    #txt = ' '.join(content.split())
    txt = content.split(".")
    del(txt[-1])
    root_dict = dict()
    ngram_dict = dict()
    linecounter = 1
    for i in txt:
        currentline = i.strip()
        #print "Current Line : " + currentline
        root_list = inflection.stem(currentline)
        root_dict[linecounter] = root_list
        linecounter += 1
    in1.close()
    for linenumber, rootlist in root_dict.iteritems():
        #print "Line Number : " + str(linenumber)
        #print "Number of root words = " + str(len(rootlist))
        ngram = ngrams.ngrams(rootlist, 1)
        ngram_dict[linenumber] = ngram
    return ngram_dict
Example #33
def analyse_cable(cable, conn):
    global NGRAM_INSERT, LOCATION_INSERT
    idx = cable["id"]
    # Clean the data
    content = cable.get("content")
    content = cleaner.slugify(content)
    content = cleaner.stopwords(content)
    # get the ngrams
    records = ngrams(content, n_max=3)
    cities = geo.get_cities(content)
    countries = geo.get_countries(content)
    cur = conn.cursor(cursor_factory=DictCursor)
    # Record ngrams, start transaction
    ngram_rows = []
    for token, count in records.iteritems():
        row = (idx, token, count, cable['date'])
        ngram_rows.append(row)
    # Record geo location
    location_rows = []
    # Collect city to insert
    for city in cities:
        row = (idx, city["name"].upper(), "CITY", city["countrycode"],
               city["latitude"], city["longitude"], cable['date'])
        location_rows.append(row)
    # Collect countries to insert
    for country in countries:
        row = (idx, country["name"].upper(), "COUNTRY", country["countrycode"],
               None, None, cable['date'])
        location_rows.append(row)
    try:
        cur.executemany(NGRAM_INSERT, ngram_rows)
        cur.executemany(LOCATION_INSERT, location_rows)
        conn.commit()
    except psycopg2.IntegrityError:
        # Ignore the integrity error
        conn.rollback()
    return cable
Example #34
File: run.py Project: bandjay/NER
from EntityRelation import EntityRelation
print "candidate generation started"
ER = EntityRelation(sentences_path, full_sentence_path, pos_path,
                    full_pos_path, frequent_patterns_path, significance,
                    out_path, capitalize)
ER.extract()
print 'Candidate generation done.'
""" Graph """
os.chdir(os.path.join(src_file_path, "src"))
### step 0
from step0 import step0
step0(out_path, DataStatsFile, data_path)
""" Word counts """
os.chdir(os.path.join(src_file_path, "EntityCleaning"))
from ngrams import ngrams
ngrams(raw_text_path, intermediate_data_path)
""" RUN with either PMI or t-score because segement file will be updated based on collocation score """
""" cleaning entity mentions based on collocation measure(PMI) """
if collocation_measure == "PMI":
    os.chdir(os.path.join(src_file_path, "EntityCleaning"))
    from collocation import collocation
    collocation(Entity_file_path, intermediate_data_path, num_lines, "PMI")

if collocation_measure == "t-score":
    """ cleaning entity mentions based on collocation measure(t-score) """
    os.chdir(os.path.join(src_file_path, "EntityCleaning"))
    from collocation import collocation
    collocation(Entity_file_path, intermediate_data_path, num_lines, "t-score")
""" outlier detection in seed file by type """
os.chdir(os.path.join(src_file_path, "NoiseDetection"))
from get_seed_file import get_seed_file
Example #35
from Indexes import Indexes
import matplotlib.pyplot as plt
from classifier import MaximumEntropyClassifier

TRAIN_SIZE = 800
n = 1

print "Maximum Entropy"
pos = os.listdir("pos")
neg = os.listdir("neg")

ind = Indexes('r',1,TRAIN_SIZE)
print "> determined Indices"
ind.next()

pos_grams = [ngrams.ngrams(n, open("pos/"+pos[i]).read()) for i in ind.get_pos_train_ind()]
pos_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(pos_grams),16165)
neg_grams = [ngrams.ngrams(n, open("neg/"+neg[i]).read()) for i in ind.get_neg_train_ind()]
neg_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(neg_grams),16165)
print "> collapsed grams"

trainingset = [([k],'pos',v) for (k,v) in pos_collapsed_grams.iteritems()]
trainingset.extend([([k],'neg',v) for (k,v) in neg_collapsed_grams.iteritems()])
m = MaximumEntropyClassifier(trainingset)
print "> created model"

pos_res = []
neg_res = []
pos_tests = [ngrams.ngrams(n, open("pos/"+pos[i]).read()) for i in ind.get_pos_test_ind()]
pos_results = [m.classify(test) for test in pos_tests]
pos_correct = len([i for i in pos_results if i >= 0.5])
Example #36
import matplotlib.pyplot as plt
from classifier import MaximumEntropyClassifier

TRAIN_SIZE = 800
n = 1

print "Maximum Entropy"
pos = os.listdir("pos")
neg = os.listdir("neg")

ind = Indexes('r', 1, TRAIN_SIZE)
print "> determined Indices"
ind.next()

pos_grams = [
    ngrams.ngrams(n,
                  open("pos/" + pos[i]).read())
    for i in ind.get_pos_train_ind()
]
pos_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(pos_grams),
                                        16165)
neg_grams = [
    ngrams.ngrams(n,
                  open("neg/" + neg[i]).read())
    for i in ind.get_neg_train_ind()
]
neg_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(neg_grams),
                                        16165)
print "> collapsed grams"

trainingset = [([k], 'pos', v) for (k, v) in pos_collapsed_grams.iteritems()]
trainingset.extend([([k], 'neg', v) for (k, v) in neg_collapsed_grams.iteritems()])
Example #37
def index_of_coincidence(sequence):
    sequence = ngrams(sequence, 1)
    ngrams_frequency = calculate_frequency_norm(sequence)
    return reduce(
        lambda acc, symbol: acc + (ngrams_frequency[symbol] / 100)**2,
        ngrams_frequency, 0)
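A quick usage sketch; the reference values below are standard cryptanalysis figures rather than anything computed here, and they assume calculate_frequency_norm returns per-symbol frequencies as percentages (as the division by 100 above suggests). Typical English text has an index of coincidence of roughly 0.065-0.067, while uniformly random letters give about 1/26 ≈ 0.038, which is what makes the statistic useful for detecting polyalphabetic ciphers.

ioc = index_of_coincidence("DEFENDTHEEASTWALLOFTHECASTLE")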
Example #38
def domain(document, crossvalidationundersampling,ArffL,A=0, undersampler=0,sentiment=0 ):
	test=pd.read_csv('documents\csv\drunk\drunkTEXT400'+'.csv'  )
	test.L=test.L.replace(['y','n'], ['True','False'])
	df1=pd.read_csv(document+'.csv'  )
	df1.L=df1.L.replace(['y','n'], ['True','False'])
	joinc=joindocuments(df1,df1)
	top = topwords(df1,'Clean tweet',100)
	main_domain = join(df1,'Clean tweet')
	
	bigrams=ngrams(df1,'Clean tweet')
	print 'bigrams'
	print bigrams.bigrams
	main_domain.joinall(bigrams.bigrams,2)
	main_domain.joinall(top.top,1)
	
	
	
	main_domain.df.to_csv('prueba.csv',index=False)
	ratiov=ratio(main_domain.df,'L')
	ratios=ratiov.getoddratios(top.top)
	print 'ratios'
	print ratios		
	ds=list(ratios.keys())
	testobject = join(test,'Clean tweet')
	oddradiojoin=join(df1,'Clean tweet')
	oddradiojoin.joinall(ds,1)
	testobject.joinall(ds,1)
	oddradiojoin.joinall(bigrams.bigrams,2)
	testobject.joinall(bigrams.bigrams,2)
	test=testobject.df
	cols=['Clean tweet']
	if sentiment==1:
		cols=['Clean tweet','sentiment_polarity', 'sentiment_subjectivity', 'absPolarity']

	try:
		for x in cols:
			del oddradiojoin.df[x]
			del test[x]
	except:
		pass
	#training, test=joinc.gettrainingandtestp(oddradiojoin.df)
	print 'matrix of elements to reduce'
	print "saul,",oddradiojoin.df.shape
	#########################################################
	if undersampler==1:
	  print "saul,",oddradiojoin.df.shape
	  oddradiojoin.df=joinc.undersampling(oddradiojoin.df)
	  print oddradiojoin.df.shape
	if A==1:
		
		
		
		dftraining, dftest=pcaf(oddradiojoin.df,test)
		oddradiojoin.df =dftraining.join(oddradiojoin.df["L"])
		
		
		test=dftest.join(test["L"])

	
	print oddradiojoin.df.shape
	training=oddradiojoin.df
	
	training=training.replace(['True','False'], [True,False])	
	test=test.replace(['True','False'], [True,False])
	training=training.astype(np.float64)
	test=test.astype(np.float64)
	training['L']=training['L'].astype(bool)
	test['L']=test['L'].astype(bool)
	A=str(A)
	sentiment=str(sentiment)
	oddradiojoin.df.to_csv('crossvalidation.csv',index=False)
	#undersampleddf1.to_csv(str(crossvalidationundersampling) +'\undersampling'+A+'.csv',index=False)
	headers_names=list(training.columns.values)
	headers_names.remove('L')
	headers_names.append('L')
	headers_names1=list(test.columns.values)
	print headers_names,'headers test',headers_names1
	test = test[headers_names]
	training = training[headers_names]
	print 'training' +str(training.dtypes)
	test.to_csv(str(crossvalidationundersampling) + r'\test1'+A+'.csv',index=False)
	training.to_csv(str(crossvalidationundersampling) +r'\training1'+A+'.csv',index=False)
	TRAINING=training.as_matrix(columns=None)
	TEST=test.as_matrix(columns=None)
	print 'training'
	print training.dtypes
	
	arff.dump(ArffL +r'\trainingwu'+A+str(undersampler)+sentiment+'.arff',TRAINING, relation="whatever", names=headers_names)
	 
	arff.dump(ArffL +r'\testwu'+A+str(undersampler)+sentiment+'.arff',TEST, relation="whatever", names=headers_names)
Example #39
def calculate_brevity_penalty(translation, references):
    mean_ref_len = statistics.mean([len(ngrams(ref)[0]) for ref in references])
    mean_ref_len += 0.5
    return min(1, (len(ngrams(translation)[0]) / math.floor(mean_ref_len)))
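A sketch of how the two helpers above could be combined into a BLEU-like score; this composition is an assumption, not something shown in the surrounding code.

def bleu_like_score(translation, references):
    # geometric mean of the n-gram precisions (get_ngram_overlap already takes
    # the fourth root of the product), scaled down by the brevity penalty
    return (calculate_brevity_penalty(translation, references) *
            get_ngram_overlap(translation, references))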
Example #40
eng_frequency = pd.read_csv('../ngrams-frequency/letter_frequency.csv')
eng_frequency['ngram'] = eng_frequency['ngram'].map(
    lambda ng: tuple([s for s in ng]))
eng_frequency = eng_frequency.set_index('ngram')

KEY_RANGE = 256
KEY_LEN = key_len(encoded_utf8)
KEY = []
for i in range(KEY_LEN):
    msg = encoded_utf8[i::KEY_LEN]

    best_key = -1
    best_score = sys.maxsize
    for key in range(KEY_RANGE):
        decoded_msg = vigenere_cipher(msg, chr(key).encode())
        decoded_msg = ngrams(decoded_msg, 1)
        ngrams_frequency = calculate_frequency_norm(decoded_msg)

        score = 0
        for ngram in ngrams_frequency:
            if ngram not in ALPHABET:
                score = sys.maxsize
                break
            else:
                score += (ngrams_frequency[ngram] -
                          eng_frequency.get(ngram, 0))**2
        score = score**0.5
        best_score, best_key = (score,
                                key) if score < best_score else (best_score,
                                                                 best_key)
Example #41
def domain(document,
           crossvalidationundersampling,
           ArffL,
           A=0,
           undersampler=0,
           sentiment=0):
    test = pd.read_csv('documents\csv\drunk\drunkTEXT400' + '.csv')
    test.L = test.L.replace(['y', 'n'], ['True', 'False'])
    df1 = pd.read_csv(document + '.csv')
    df1.L = df1.L.replace(['y', 'n'], ['True', 'False'])
    joinc = joindocuments(df1, df1)
    top = topwords(df1, 'Clean tweet', 100)
    main_domain = join(df1, 'Clean tweet')

    bigrams = ngrams(df1, 'Clean tweet')
    print 'bigrams'
    print bigrams.bigrams
    main_domain.joinall(bigrams.bigrams, 2)
    main_domain.joinall(top.top, 1)

    main_domain.df.to_csv('prueba.csv', index=False)
    ratiov = ratio(main_domain.df, 'L')
    ratios = ratiov.getoddratios(top.top)
    print 'ratios'
    print ratios
    ds = list(ratios.keys())
    testobject = join(test, 'Clean tweet')
    oddradiojoin = join(df1, 'Clean tweet')
    oddradiojoin.joinall(ds, 1)
    testobject.joinall(ds, 1)
    oddradiojoin.joinall(bigrams.bigrams, 2)
    testobject.joinall(bigrams.bigrams, 2)
    test = testobject.df
    cols = ['Clean tweet']
    if sentiment == 1:
        cols = [
            'Clean tweet', 'sentiment_polarity', 'sentiment_subjectivity',
            'absPolarity'
        ]

    try:
        for x in cols:
            del oddradiojoin.df[x]
            del test[x]
    except:
        pass
    #training, test=joinc.gettrainingandtestp(oddradiojoin.df)
    print 'matrix of elements to reduce'
    print "saul,", oddradiojoin.df.shape
    #########################################################
    if undersampler == 1:
        print "saul,", oddradiojoin.df.shape
        oddradiojoin.df = joinc.undersampling(oddradiojoin.df)
        print oddradiojoin.df.shape
    if A == 1:

        dftraining, dftest = pcaf(oddradiojoin.df, test)
        oddradiojoin.df = dftraining.join(oddradiojoin.df["L"])

        test = dftest.join(test["L"])

    print oddradiojoin.df.shape
    training = oddradiojoin.df

    training = training.replace(['True', 'False'], [True, False])
    test = test.replace(['True', 'False'], [True, False])
    training = training.astype(np.float64)
    test = test.astype(np.float64)
    training['L'] = training['L'].astype(bool)
    test['L'] = test['L'].astype(bool)
    A = str(A)
    sentiment = str(sentiment)
    oddradiojoin.df.to_csv('crossvalidation.csv', index=False)
    #undersampleddf1.to_csv(str(crossvalidationundersampling) +'\undersampling'+A+'.csv',index=False)
    headers_names = list(training.columns.values)
    headers_names.remove('L')
    headers_names.append('L')
    headers_names1 = list(test.columns.values)
    print headers_names, 'headers test', headers_names1
    test = test[headers_names]
    training = training[headers_names]
    print 'training' + str(training.dtypes)
    test.to_csv(str(crossvalidationundersampling) + r'\test1' + A + '.csv',
                index=False)
    training.to_csv(str(crossvalidationundersampling) + r'\training1' + A +
                    '.csv',
                    index=False)
    TRAINING = training.as_matrix(columns=None)
    TEST = test.as_matrix(columns=None)
    print 'training'
    print training.dtypes

    arff.dump(ArffL + r'\trainingwu' + A + str(undersampler) + sentiment +
              '.arff',
              TRAINING,
              relation="whatever",
              names=headers_names)

    arff.dump(ArffL + r'\testwu' + A + str(undersampler) + sentiment + '.arff',
              TEST,
              relation="whatever",
              names=headers_names)