Example #1
def cleanMP(raw):
    record = []
    for i in range(len(raw)):
        d1 = {}
        try:
            # Drop the leading cell when it lacks the name-field marker.
            if raw[i][0].find('ताम') == -1:
                raw[i].remove(raw[i][0])
        except Exception:
            pass  # empty or malformed row
        try:
            d1['hname'] = raw[i][0].split(':')[1].strip()
            d1['name'] = transliterate(raw[i][0].split(':')[1].strip(),
                                       DEVANAGARI, HK)
            d1['name'] = d1['name'].replace('sinha', 'singh')
        except Exception:
            d1['name'] = ''
        try:
            d1['hfather/husband'] = raw[i][1].split(':')[1].strip()
            d1['father/husband'] = transliterate(
                raw[i][1].split(':')[1].strip(), DEVANAGARI, HK)
        except Exception:
            d1['father/husband'] = ''
        try:
            # Keep only the digits of the age cell; a valid age has two.
            d1['age'] = ''
            for x in range(len(raw[i][3])):
                if 48 <= ord(raw[i][3][x]) <= 57:
                    d1['age'] = d1['age'] + raw[i][3][x]
            if len(d1['age']) != 2:
                d1['age'] = ''
        except Exception:
            d1['age'] = ''
        # Any cell containing 'पुरूष' (male) marks the record M, else F.
        d1['sex'] = 'M' if any('पुरूष' in myraw for myraw in raw[i]) else 'F'
        record.append(d1)
    return record
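
A minimal usage sketch (not from the original source): cleanMP expects each row of raw to be a list of field strings in the order name, father/husband, (unused), age, sex. The sample row below is hypothetical, and transliterate, DEVANAGARI and HK are assumed imported as in the function above.

row = ['ताम: रमेश कुमार',    # name cell; the marker keeps it from being dropped
       'पिता: सुरेश कुमार',   # father/husband cell
       '',                    # unused third cell
       'उम्र: 42',            # age cell; only its digits are kept
       'लिंग: पुरूष']         # sex marker the function scans for
parsed = cleanMP([row])[0]
print parsed['age'], parsed['sex']  # -> 42 M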
Example #2
def transliterate_word(text, lang='H'):
    for p in ",.><?/+=-_}{[]*&^%$#@!~`\"\\|:;()":
        text = text.replace(p, '')
    # After stripping, nothing may be left of the token; bail out early.
    if not text:
        return []
    text = text.encode('utf-8')
    dict_to_use = hindi_dictionary if lang == 'H' else marathi_dictionary

    # 1) Direct hit: convert the HK romanization to Devanagari and look it up.
    immediate_transliteration = transliterate(text, HK, DEVANAGARI)
    attempt = list(dict_to_use.find({'word': immediate_transliteration}))
    if attempt:
        return [attempt[0]['word']]

    # 2) Match against stored transliterations, exact or lowercased.
    attempt = list(dict_to_use.find(
        {'transliterated': {'$in': [text, text.lower()]}}))
    if attempt:
        return [attempt[0]['word']]

    # 3) Try Google's transliteration candidates against the dictionary.
    google_transliteration = google_transliterate(text, lang)
    for tr in google_transliteration:
        ll = list(dict_to_use.find({'word': tr}))
        if ll:
            return [li['word'] for li in ll]

    # 4) Soundex fallback: rank same-sounding entries by edit distance
    #    and keep only the close matches (distance < 3).
    sound = soundex(text)
    attempt = list(dict_to_use.find({'sound': sound}))
    neighbors = sorted([(i['word'], distance(text, i['transliteration']))
                        for i in attempt], key=lambda x: x[1])
    return [u[0] for u in neighbors if u[1] < 3]
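
A hypothetical invocation of the lookup cascade above, assuming the Mongo-backed dictionaries and the google_transliterate, soundex and distance helpers are in scope:

candidates = transliterate_word('namaste!', lang='H')  # punctuation stripped first
print candidates  # e.g. ['नमस्ते'] on a direct hit, else near-sounding words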
Example #3
def train(train_ratio, test_ratio):
    MFILE = 'sanscript4.crfsuite'
    count = 0
    train_sents = []
    test_sents = []
    with open(RESOURCE_FOLDER+'/Output/all-words-pairs.txt') as infile:
        for line in infile:
            count += 1
            line = line.strip()
            e, h = line.split('\t')
            h = h.decode('utf-8')
            ie = transliterate(h, DEVANAGARI, HK).encode('utf-8')
            # Draw independent random samples for the train and test sets.
            if random() < train_ratio:
                train_sents.append((e, ie))
            if random() < test_ratio:
                test_sents.append((e, h))

    print '[*] NOT TRAINING'
    X_train = [s[0] for s in train_sents]
    y_train = [s[1] for s in train_sents]
    X_test = [s[0] for s in test_sents]
    y_test = [s[1] for s in test_sents]

    print 'Loading trainer...'
    print len(X_train), len(y_train)
    print len(X_test), len(y_test)

    trainer = pycrfsuite.Trainer(verbose=False)

    count = 0
    skipped = 0
    for xseq, yseq in zip(X_train, y_train):
        if count % 100 == 0:
            print count
        try:
            xss = list(ngrams(xseq))
            yss = list(ngrams(yseq))
            for xs in xss:
                for ys in yss:
                    count += 1
                    trainer.append([xs], [ys])
        except Exception as e:
            print str(e)
            skipped += 1
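
    # Sketch, not in the original fragment (which prints '[*] NOT TRAINING' and
    # never persists a model): finishing the run with python-crfsuite would
    # look like this; the parameter values are illustrative.
    trainer.set_params({
        'c1': 1.0,             # L1 regularization (illustrative)
        'c2': 1e-3,            # L2 regularization (illustrative)
        'max_iterations': 50,  # illustrative iteration cap
    })
    trainer.train(MFILE)  # writes sanscript4.crfsuite, which Example #4 loads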
Example #4
    print '[Done]'
    tagger = pycrfsuite.Tagger()
    tagger.open(MFILE)
    print '[Begin Testing]...'
    count = 0
    print 'Writing to output.txt'
    with codecs.open('output.txt', 'w', encoding='utf-8') as outfile:
        for xseq, yseq in zip(X_test, y_test):
            count += 1
            if count % 100 == 0:
                print count
            xss = list(esplitclusters(xseq))
            ypredicted = tagger.tag(xss)
            ypredicted2 = tagger.tag([xss])
            _ie = transliterate(''.join(ypredicted), HK, DEVANAGARI)
            v1 = "%s\t" % xseq
            v2 = "%s\t" % yseq
            v3 = "%s\t" % [u.decode('utf-8') for u in ypredicted]
            v4 = "%s\t" % ''.join(ypredicted).decode('utf-8')
            v5 = "%s\t" % [u.decode('utf-8') for u in ypredicted2]
            v6 = "%s\t" % ','.join(ypredicted2).decode('utf-8')
            v7 = "%s\t" % _ie
            val = v1 + v2 + v3 + v4 + v5 + v6 + v7 + '\n'
            outfile.write(val)


if __name__ == '__main__':
    train(0.04, 0.2)
    # print ngrams('arshad')
    # print ngrams('something')
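
The ngrams helper used above (and in Examples #3 and #6) is not included in these fragments. A hypothetical reconstruction, matching how Example #6 unpacks each item into (unit, left context, right context):

def ngrams(word):
    # Hypothetical stand-in: one (character, prefix, suffix) triple per
    # position, consistent with _u[0], _u[1], _u[2] in Example #6.
    for i in range(len(word)):
        yield (word[i], word[:i], word[i + 1:])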
Example #5
import pickle

import fuzzy
from pymongo import MongoClient

# transliterate, DEVANAGARI, HK and strip_non_ascii are assumed imported
# from the project's helper modules.

conn = MongoClient()
db = conn.sentiment_analysis_db

path = '../files/hindi/'
word2Synset = pickle.load(open(path + "WordSynsetDict.pk", 'rb'))

# dmetaphone = fuzzy.DMetaphone()
soundex = fuzzy.Soundex(4)

print db.hindi_dictionary.drop_indexes()
print db.hindi_dictionary.remove({})

words = []

for word in word2Synset.keys():
    transliterated = strip_non_ascii(transliterate(word, DEVANAGARI, HK))
    synsets = []
    for vv in word2Synset[word].values():
        synsets.extend(vv)

    lower = transliterated.lower()
    sound = soundex(lower.decode('ascii', errors='ignore'))
    words.append({
        'word': word,
        'synsets': synsets,
        'transliteration': lower,
        'sound': sound
    })
    # Insert in batches of 1000.
    if len(words) >= 1000:
        db.hindi_dictionary.insert_many(words)
        words = []

# Flush the final partial batch, which the in-loop check misses.
if words:
    db.hindi_dictionary.insert_many(words)
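
The script drops all indexes up front, and the other fragments look words up by several fields, so recreating the indexes afterwards is the natural follow-up. A sketch using standard pymongo calls:

db.hindi_dictionary.create_index('word')
db.hindi_dictionary.create_index('sound')
# The fragments read both key spellings ('transliterated' in Examples #2
# and #6, 'transliteration' as stored above), so cover both.
db.hindi_dictionary.create_index('transliterated')
db.hindi_dictionary.create_index('transliteration')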
Example #6
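The function below increments several module-level counter tables that these fragments never define. A minimal reconstruction so the code runs, assuming plain nested count dictionaries:

from collections import defaultdict

# Assumed module-level co-occurrence tables (not shown in the original).
pointers = defaultdict(lambda: defaultdict(int))
x_previous_tags = defaultdict(lambda: defaultdict(int))
x_next_tags = defaultdict(lambda: defaultdict(int))
y_previous_tags = defaultdict(lambda: defaultdict(int))
y_next_tags = defaultdict(lambda: defaultdict(int))
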
def start():
    size_diff, total = 0, 0
    sum_total = defaultdict(int)

    with open(RESOURCE_FOLDER + '/Output/all-words-pairs.txt') as infile:
        for line in infile:
            total += 1
            line = line.strip()
            e, h = line.split('\t')
            e = e.decode('utf-8')
            h = h.decode('utf-8')
            op = transliterate(h, DEVANAGARI, HK)
            xss = list(ngrams(e))
            yss = list(ngrams(op))

            if total > 2000:
                break

            for _u in xss:
                for _v in yss:
                    u, u_prev, u_next = _u
                    v, v_prev, v_next = _v
                    pointers[u][v] += 1
                    for i in xrange(len(u_prev)):
                        x_previous_tags[u][u_prev[i:]] += 1
                    for j in xrange(1, len(u_next)):
                        x_next_tags[u][u_next[:j]] += 1
                    for i in xrange(len(v_prev)):
                        y_previous_tags[v][v_prev[i:]] += 1
                    for j in xrange(1, len(v_next)):
                        y_next_tags[v][v_next[:j]] += 1

                    sum_total[u] += len(yss)
            if total % 1000 == 0:
                print total
    print len(pointers), '...'
    print 'Normalizing and saving probabilities'
    with codecs.open('probabilities.txt', 'w', encoding='utf8') as outfile:
        for k, vdict in pointers.iteritems():
            tot = sum_total[k]
            values = sorted([(u, (v * factor(k, u)) / tot)
                             for (u, v) in vdict.iteritems()],
                            reverse=True,
                            key=lambda x: x[1])
            # Keep the top 30% of targets, capped at 10.
            ll = min(int(len(vdict) * .3), 10)
            values = values[:ll]
            for value in values:
                u, v = value
                if v < 0.001:
                    continue
                outfile.write("%s\t%s\t%f\n" % (k, u, v))
            outfile.flush()

    print 'Writing x previous'
    with codecs.open('x_previous.txt', 'w', encoding='utf8') as outfile:
        for k, vdict in x_previous_tags.iteritems():
            for u, v in vdict.iteritems():
                outfile.write("%s\t%s\t%f\n" % (k, u, v))
            outfile.flush()
    print 'Writing x next'
    with codecs.open('x_next.txt', 'w', encoding='utf8') as outfile:
        for k, vdict in x_next_tags.iteritems():
            for u, v in vdict.iteritems():
                outfile.write("%s\t%s\t%f\n" % (k, u, v))
            outfile.flush()
    print 'Writing y previous'
    with codecs.open('y_previous.txt', 'w', encoding='utf8') as outfile:
        for k, vdict in y_previous_tags.iteritems():
            for u, v in vdict.iteritems():
                outfile.write("%s\t%s\t%f\n" % (k, u, v))
            outfile.flush()
    print 'Writing y next'
    with codecs.open('y_next.txt', 'w', encoding='utf8') as outfile:
        for k, vdict in y_next_tags.iteritems():
            for u, v in vdict.iteritems():
                outfile.write("%s\t%s\t%f\n" % (k, u, v))
            outfile.flush()

    print 'Total:', total
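
start() multiplies each count by factor() while normalizing, but that function is not defined in these fragments. A neutral placeholder so the snippet executes:

def factor(k, u):
    # Hypothetical stand-in; the original weighting function is not shown.
    return 1.0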
from Levenshtein import distance
from pymongo import MongoClient
import fuzzy

# transliterate, DEVANAGARI, HK and strip_non_ascii are assumed imported
# from the project's helper modules.

conn = MongoClient()
db = conn.sentiment_analysis_db

soundex = fuzzy.Soundex(4)

for line in open('../../resources/word-frequency-hindi.txt'):
    line = line.strip()
    word, freq = line.split('\t')
    word = word.decode('utf-8')
    found = db.hindi_dictionary.find_one({'word': word})
    if not found:
        transliterated = transliterate(word, DEVANAGARI, HK)
        transliterated = strip_non_ascii(transliterated)
        found = db.hindi_dictionary.find_one(
            {'transliterated': transliterated})
        if not found:
            sound = soundex(transliterated)
            sounding_same = list(db.hindi_dictionary.find({'sound': sound}))
            if sounding_same:
                # Pick the same-sounding entry closest by edit distance.
                found = sorted([(i['word'], distance(word, i['word']))
                                for i in sounding_same],
                               key=lambda x: x[1])[0][0]
        else:
            found = found['word']
    else:
        found = found['word']
    print word, found