Exemple #1
0
def main():
    if len(sys.argv) != 2:
        print "Usage: %s inputFile" % sys.argv[0]
        sys.exit(1)

    ifile = codecs.open(sys.argv[1], 'r', encoding='utf-8')
    ofile = codecs.open(sys.argv[1] + '.syllables', 'w', encoding='utf-8')

    prev_syllables = u""
    prev_count = 0

    for line in (l.strip() for l in ifile):
        syllables = list(ml.ClusterIter(line))  #ml.ClusterIter(line).next ()

        if len(syllables) < 2:
            syllables = "".join(syllables)
        else:
            syllables = "".join(syllables[:2])
            if u"\u1039" in syllables or u'\u103F' in syllables:
                continue

        if syllables == prev_syllables:
            prev_count += 1
        else:
            #if (u"\u1039" in prev_syllables and prev_count > 25) or (prev_count > 50):
            if (prev_count > 50):
                ofile.write('%s\n' % (prev_syllables))
                #else:
                #print "skipping ", prev_syllables.encode ('utf8')
            prev_count = 0
            prev_syllables = syllables

    ifile.close()
    ofile.close()
Exemple #2
0
def main():
    if len(sys.argv) != 2:
        print "Usage: %s inputFile" % sys.argv[0]
        sys.exit(1)

    ifile = codecs.open(sys.argv[1], 'r', encoding='utf-8')
    ofile = codecs.open(sys.argv[1] + '.pcol', 'w', encoding='utf-8')

    for line in (l.strip() for l in ifile):
        for c in ml.ClusterIter(line):
            ofile.write("%s " % c)
        ofile.write('\n')

    ifile.close()
    ofile.close()
Exemple #3
0
def main():
    if len(sys.argv) != 2:
        print "Usage: %s inputFile" % sys.argv[0]
        sys.exit(1)

    ifile = codecs.open(sys.argv[1], 'r', encoding='utf-8')
    ofile = codecs.open(sys.argv[1] + '.prefixes', 'w', encoding='utf-8')

    prev_syllables = []
    prev_count = 0
    match = False

    for line in (l.strip() for l in ifile):
        syllables = list(ml.ClusterIter(line))  #ml.ClusterIter(line).next ()

        if len(syllables) <= 1:
            match = False
            prev_syllables = []
            prev_count = 0
            continue

        if match:
            if startswith(syllables, match):
                #print mystr(syllables).encode ('utf-8'), " startswith ", mystr(match).encode ('utf-8')
                prev_count += 1
            else:
                #print mystr(syllables).encode ('utf-8'), " not startswith ", mystr(match).encode ('utf-8')
                if prev_count > 5:
                    ofile.write(mystr(match) + '\n')
                    #else:
                    #print "omitting ", mystr(match).encode ('utf-8')
                prev_syllables = syllables
                prev_count = 0
                match = False
        else:
            match = get_longest_match(syllables, prev_syllables)
            #print "syllable = %s, prev_syl =%s , match = %s" %(mystr (syllables).encode ('utf-8'),\
            #mystr (prev_syllables).encode ('utf-8'),\
            #mystr (match).encode ('utf-8'))
            if len(match) < 2:
                #if prev_syllables:
                #print "omitting ", mystr(prev_syllables).encode ('utf-8')
                #     ofile.write (mystr (prev_syllables) + '\n')
                prev_syllables = syllables
                match = False

    ifile.close()
    ofile.close()
Exemple #4
0
def main():
    if len(sys.argv) != 2:
        print "Usage: %s inputFile" % sys.argv[0]
        sys.exit(1)

    ifile = codecs.open(sys.argv[1], 'r', encoding='utf-8')

    for k, line in enumerate((l.strip() for l in ifile)):
        #print line.encode ('utf8')
        try:
            syllables = list(ml.ClusterIter(line))
        except:
            pass
        if len(syllables) == 1 and u'\u1039' not in syllables[0]:
            print "".join(syllables).encode('utf8')
    ifile.close()
Exemple #5
0
def main():
    if len(sys.argv) != 2:
        print "Usage: %s inputFile" % sys.argv[0]
        sys.exit(1)

    if (os.path.exists(sys.argv[1] + '.dpkl')):
        print "Found pickle."
        with open(sys.argv[1] + '.dpkl', 'rb') as fil:
            finder4 = cPickle.load(fil)
    else:
        ifile = codecs.open(sys.argv[1], 'r', encoding='utf-8')

        syllables = []
        skip = "per()".join(ml.puncts + ml.digits)
        for i, line in enumerate(l.strip() for l in ifile):
            progress(i)
            line = line.replace(" ", "")
            for p in skip:
                line = line.replace(p, "")
            try:
                p = [c for c in ml.ClusterIter(line)]
                for each in possible_bigram_list(p):
                    syllables += ["".join(each)]
            except Exception, e:
                print str(e), line, c
        ifile.close()

        finder4 = BigramCollocationFinder.from_words(syllables)
        finder4.apply_freq_filter(10)  # only bigrams that appear 3+ times

        # remove stopwords
        stopwords = []
        with codecs.open('stopwords', 'r', 'utf8') as stopwordFile:
            for each in stopwordFile:
                stopwords.append(each.strip())

        def contains_stopwords(*syllables):
            for each in stopwords:
                if each in syllables:
                    return True
            return False

        finder4.apply_ngram_filter(contains_stopwords)

        with open(sys.argv[1] + '.dpkl', 'wb') as fil:
            cPickle.dump(finder4, fil)
Exemple #6
0
def main():
    if len(sys.argv) != 2:
        print "Usage: %s inputFile" % sys.argv[0]
        sys.exit(1)

    ifile = codecs.open(sys.argv[1], 'r', encoding='utf-8')

    for k, line in enumerate((l.strip() for l in ifile)):
        print line.encode('utf8')
        syllables = list(ml.ClusterIter(line))
        bigrams = nltk.bigrams(syllables)
        trigrams = nltk.trigrams(syllables)
        tetragrams = nltk.bigrams(["".join(x) for x in _bigrams(syllables)
                                   ]) if len(syllables) > 3 else []
        for each in itertools.chain(bigrams, trigrams, tetragrams):
            print "".join(each).encode('utf8')
    ifile.close()
Exemple #7
0
def main():
    if len(sys.argv) != 3:
        print "Usage: %s syllableFile inputFile" % sys.argv[0]
        sys.exit(1)

    p = re.compile(
        u'(\u1004\u103a\u1039)?((\u1000\u103b\u1015\u103a|\u100b\u1039\u100b|\u100b\u1039\u100c|\u100d\u1039\u100d|\u100d\u1039\u100e|\u100f\u1039\u100d)|((\u104e\u1004\u103a\u1038|\u1000|\u1001|\u1002|\u1003|\u1004|\u1005|\u1006|\u1007|\u1008|\u1009|\u100a|\u100b|\u100c|\u100d|\u100e|\u100f|\u1010|\u1011|\u1012|\u1013|\u1014|\u1015|\u1016|\u1017|\u1018|\u1019|\u101a|\u101b|\u101c|\u101d|\u101e|\u101f|\u1020|\u1021|\u1023|\u1024|\u1025|\u1026|\u1027|\u1029|\u102a|\u103f|\u1040|\u1041|\u1042|\u1043|\u1044|\u1045|\u1046|\u1047|\u1048|\u1049|\u104a|\u104b|\u104c|\u104d|\u104e|\u104f)(\u1039\u1010\u103d|\u1039\u1000|\u1039\u1001|\u1039\u1002|\u1039\u1003|\u1039\u1005|\u1039\u1006|\u1039\u1007|\u1039\u1008|\u1039\u100b|\u1039\u100c|\u1039\u100d|\u1039\u100e|\u1039\u100f|\u1039\u1010|\u1039\u1011|\u1039\u1012|\u1039\u1013|\u1039\u1014|\u1039\u1015|\u1039\u1016|\u1039\u1017|\u1039\u1018|\u1039\u1019|\u1039\u101a|\u1039\u101b|\u1039\u101c|\u1039\u101e|\u1039\u1021)?))(\u103a)?(\u103b)?(\u103c)?(\u103d)?(\u103e)?(\u1031)?(\u102d|\u102e)?(\u102f|\u1030)?(\u1032|\u1036)?(\u102b|\u102c)?(\u1037)?(\u103a)?(\u1037)?(\u1038)?',
        re.U)

    sfile = codecs.open(sys.argv[1], 'r', encoding='utf-8')
    ifile = codecs.open(sys.argv[2], 'r', encoding='utf-8')
    ofile = codecs.open(sys.argv[2] + '.checked', 'w', encoding='utf-8')

    for line in (l.strip() for l in ifile):
        for i in ml.ClusterIter(line):
            if not p.search(i):
                ofile.write(i + '\t' + line + '\n')
                break

    sfile.close()
    ifile.close()
    ofile.close()
Exemple #8
0
def main():
    """Write clustered input lines and shared-prefix pieces to ``<inputFile>.pwar2``.

    Reads the UTF-8 file named by ``sys.argv[1]``, clusters every stripped
    line with ``ml.ClusterIter`` and writes ``mystr(syllables)`` for each
    non-empty line.  Between those lines it also writes candidate prefixes
    (``match``) and the remainders of previous lines, depending on how the
    current line relates to the previous one through ``get_longest_match``
    and ``startswith``.  A progress counter is printed every 1000 lines.

    ``mystr``, ``startswith`` and ``get_longest_match`` are helpers defined
    elsewhere; presumably they join, prefix-test and compare cluster lists --
    confirm against their definitions.

    Prints a usage message and exits with status 1 unless exactly one
    command-line argument is supplied.
    """
    if len(sys.argv) != 2:
        print "Usage: %s inputFile" % sys.argv[0]
        sys.exit(1)

    # NOTE(review): neither file is closed if an exception escapes the loop.
    ifile = codecs.open(sys.argv[1], 'r', encoding='utf-8')
    ofile = codecs.open(sys.argv[1] + '.pwar2', 'w', encoding='utf-8')

    prev_syllables = []  # clusters of the previously processed line
    prev_count = 0       # updated below but never read in this function
    match = []           # active candidate prefix; empty means none
    tmp = []             # prev_syllables as of the last "no usable match" step
    pos = 0              # output offset saved just before the latest line write

    for k, line in enumerate((l.strip() for l in ifile)):
        # Progress indicator on stdout.
        if (k % 1000) == 0:
            print k

        syllables = list(ml.ClusterIter(line))  #ml.ClusterIter(line).next ()

        # A line without clusters resets the state and produces no output.
        if (len(syllables) < 1):  # < 2
            match = []
            prev_syllables = []
            prev_count = 0
            continue

        #ofile.write ("\n\ns=%s p=%s:m=%s\n" %(mystr(syllables), mystr(prev_syllables), mystr(match)))
        # With a candidate of two or more clusters, write the part of the
        # previous line that follows the candidate, provided that part is at
        # least two clusters long.
        if prev_syllables:
            if len(match) > 1 and (len(match) + 1) < len(prev_syllables):
                ofile.write("%s\n" % mystr(prev_syllables[len(match):]))

        if match:
            if startswith(syllables, match):
                prev_count += 1
            else:
                # The current line does not continue the candidate: write the
                # candidate and reset.  prev_syllables is assigned again at
                # the bottom of the loop body.
                #for u in range (prev_count):
                ofile.write("%s\n" % mystr(match))
                match = []
                prev_syllables = []
                prev_count = 0

        else:
            match = get_longest_match(syllables, prev_syllables)
            #ofile.write ("==> prev_count= %d match=%s\n" %(prev_count,mystr(syllables)))

            if len(match) < 2:
                #if len(match) == 0:
                #ofile.write ("==> here= t=%s ps=%s\n" %(mystr(tmp),mystr(prev_syllables)))
                # The previous line shares fewer than two clusters with both
                # its neighbours: seek back to the saved offset and write it
                # again without its first cluster.
                # NOTE(review): the file is not truncated after the seek, so
                # this relies on later writes covering the old bytes --
                # confirm.
                if tmp and len(get_longest_match(tmp, prev_syllables)) < 2:
                    if len(prev_syllables) > 2:
                        ofile.seek(pos)
                        ofile.write("%s\n" % mystr(prev_syllables[1:]))
                tmp = prev_syllables
                match = []
                prev_count = 0
            else:
                # New candidate found: write what follows it in the previous
                # line when that remainder has at least two clusters.
                if (len(match) + 1) < len(prev_syllables):
                    ofile.write("%s\n" % mystr(prev_syllables[len(match):]))
                prev_count += 1

        # Save where the current line starts in the output so that the next
        # iteration can seek back to it, then write the line itself.
        pos = ofile.tell()
        ofile.write("%s\n" % mystr(syllables))
        prev_syllables = syllables

    ifile.close()
    ofile.close()