Beispiel #1
0
    def extractEmojiTags(self, document:Document):
        """Split the tagged portions of ``document`` into text runs and emojis.

        Each item of ``document.get("tagged")`` is scanned character by
        character. Consecutive non-emoji characters are grouped into one
        ``CollectionTuple``; every emoji character becomes its own ``Emoji``
        entry. Each entry records:

        * ``relativePos`` - offset inside the tagged portion it came from,
        * ``absolutePos`` - offset counted across all tagged portions,
        * ``len``         - number of characters in the entry.

        Returns:
            A list with the ``getJiji()`` value of every entry, for all tagged
            portions, in document order.
        """
        # NOTE(review): assumes every tagged portion is iterable character by
        # character (e.g. a str) at this stage of the pipeline - TODO confirm.
        # NOTE(review): `emoji.UNICODE_EMOJI` is a flat mapping only in old
        # releases of the `emoji` package; verify against the pinned version.
        absolutePos = 0
        # Accumulated across *all* tagged portions; it must not be reset per
        # portion, otherwise only the last portion's entries are returned.
        collection = []
        for tagged in document.get("tagged"):
            currCollection = []
            relativePos = 0
            for c in tagged:
                if c not in emoji.UNICODE_EMOJI:
                    # Plain character: keep growing the current text run.
                    currCollection.append(c)
                    continue

                print ("{} is an emoji".format(c))
                # An emoji terminates the pending text run; emit the run first.
                text = "".join(currCollection)
                if text:
                    jiji = CollectionTuple(text=text)
                    jiji.set("relativePos", relativePos)
                    jiji.set("absolutePos", absolutePos)
                    jiji.set("len", len(text))
                    collection.append(jiji.getJiji())
                    currCollection = []
                    relativePos += len(text)
                    absolutePos += len(text)

                # Now emit the emoji itself as a single-character entry.
                jiji = Emoji(lang="emoji", text=c)
                jiji.set("relativePos", relativePos)
                jiji.set("absolutePos", absolutePos)
                jiji.set("len", 1)
                collection.append(jiji.getJiji())
                relativePos += 1
                absolutePos += 1
                # TODO: Add the rest of the attributes

            # Emit whatever text followed the last emoji of this portion.
            text = "".join(currCollection)
            if text:
                jiji = CollectionTuple(text=text)
                jiji.set("relativePos", relativePos)
                jiji.set("absolutePos", absolutePos)
                jiji.set("len", len(text))
                collection.append(jiji.getJiji())
                # Keep the absolute offset in step so the next tagged portion
                # starts right after this trailing run.
                absolutePos += len(text)

            # TODO: Replace the current tagged portion of the document with
            # the entries produced for it.
        return collection
Beispiel #2
0
    def extractLanguageTags(self, document: Document):
        """Tag the text portions of ``document`` with the language cld2 detects.

        Portions whose ``"lang"`` is ``"emoji"`` are passed through unchanged.
        Every other portion's ``"text"`` is handed to ``cld2.detect`` with
        ``returnVectors=True``; each returned vector yields one
        ``CollectionTuple`` holding the matching slice of text, its character
        offset, its character length and the language code (``vector[3]``).

        Returns:
            A list of the untouched emoji portions and the ``getJiji()`` value
            of every language-tagged slice, in document order.
        """
        collection = []
        for tagged in document.get("tagged"):
            if tagged["lang"] == "emoji":
                collection.append(tagged)
                continue

            text = tagged["text"]
            _, _, _, vectors = cld2.detect(text, returnVectors=True)

            # pycld2 documents each vector as
            # (bytesOffset, bytesLength, languageName, languageCode): the first
            # two fields index the UTF-8 *bytes*, not the characters of `text`,
            # so the slicing has to happen on the encoded form.
            encoded = text.encode("utf-8")
            for vector in vectors:
                byteStart = vector[0]
                byteLength = vector[1]
                segment = encoded[byteStart:byteStart + byteLength].decode(
                    "utf-8", errors="replace")
                # Convert the byte offset into a character offset.
                start = len(encoded[:byteStart].decode(
                    "utf-8", errors="replace"))

                jiji = CollectionTuple(text=segment)
                jiji.set("relativePos", start)
                # NOTE(review): this is still relative to the current tagged
                # portion; a true document-wide offset needs the portion's own
                # starting position - confirm what downstream steps expect.
                jiji.set("absolutePos", start)
                jiji.set("lang", vector[3])
                jiji.set("len", len(segment))  # Length of this block in characters
                collection.append(jiji.getJiji())
        return collection
 def correct(self, document:Document):
     """Spell-correct the portions of ``document`` whose language is unknown.

     Language tagging should have been done before this step. Portions
     tagged ``"un"`` have their ``"text"`` split on single spaces and each
     word passed to ``spell.correction``; the corrected words are re-joined
     into a new ``CollectionTuple``. Every other portion (emojis and
     portions with an identified language) is passed through unchanged.

     Returns:
         A list with one entry per tagged portion, in document order.
     """
     collection = []
     for tagged in document.get("tagged"):
         if tagged["lang"] != "un":
             # Emojis and identified languages are not corrected (yet).
             collection.append(tagged)
             continue

         corrected = []
         for word in tagged["text"].split(" "):
             # TODO: More thoughts need to get into this.
             cword = spell.correction(word)
             if cword is None:
                 # The spell checker may report "no suggestion" as None; keep
                 # the original word so the join below cannot fail.
                 cword = word
             print ("Original : {}, corrected : {}".format(word, cword))
             corrected.append(cword)

         # NOTE(review): only the text is carried over to the new tuple;
         # confirm whether "lang" and the position fields must be copied too.
         jiji = CollectionTuple(text=" ".join(corrected))
         collection.append(jiji.getJiji())
     return collection
Beispiel #4
0
'''
@author mojosaurus
This is the file that executes the pipeline
'''
import os, sys
# Append the directory three levels up to sys.path so that the `src` package can be imported.
sys.path.append(os.path.join(os.path.dirname(__file__), '../../..'))

from src.tn.docproc.pipeline import Step
from src.tn.docproc.whitespace import Whitespace
from src.tn.docproc.lowercase import Lowercase
from src.tn.document.document import Document
from src.tn.docproc.regexes import Regexes
from src.tn.docproc.pipeline import Pipeline
from src.tn.docproc.emojitagger import EmojiTagger
from src.tn.docproc.languagetagger import LanguageTagger
from src.tn.docproc.spellchecktagger import SpellCheckTagger

if __name__ == "__main__":
    # Sample inputs. The literals were previously stored as mojibake (UTF-8
    # bytes decoded as CP866), so they contained no real emoji or Indic text.
    # Long sample mixing misspelt English, emojis, Telugu, Tamil and Kannada.
    text = "woooood issssss your oyster.... 🥰 ###!!! ప్రపంచం అంతా వెతికిన ధనుష్ 🤩 లాంటి మరో నటుడు దొరకడు, 🤩 சுயமாக சிந்திக்க தெரிஞ்சவன் தான் சூப்பர் ஹீரோ 🥰 ಬಠಪಢಝ ಜಂಅಂಇ ಋಋ ಡಘಫಫಝ ಡಝಫಷ"
    # Shorter sample; this assignment overrides the one above and is the text
    # that is actually processed.
    text = "woooood issssss your oyester       .... 🥰 ###!!! சுயமாக சிந்திக்க 🤩 beer"
    doc = Document(text)
    pipeline = Pipeline()

    # Steps run in the order they are added: the emoji tagger runs before the
    # language tagger, which runs before the spell checker.
    pipeline.addStep(Whitespace())
    pipeline.addStep(Lowercase())
    pipeline.addStep(Regexes())
    pipeline.addStep(EmojiTagger())
    pipeline.addStep(LanguageTagger())
    pipeline.addStep(SpellCheckTagger())
    pipeline.process(doc)
Beispiel #5
0
 def __init__(self, document=None):
     """Store the document to work on and create the spell-check helper.

     Args:
         document: The document this step operates on. When omitted (or
             ``None``), a fresh empty ``Document`` is created.
     """
     # A `Document()` default in the signature is evaluated once, when the
     # function is defined, and would be shared by every instance created
     # without an argument; build a new one per instance instead.
     self.document = Document() if document is None else document
     self.helper = SpellCheckHelper()
     print("Inside object of type : {}".format(self.__class__.__name__))
Beispiel #6
0
 def __init__(self, document=None):
     """Store the document this object operates on.

     Args:
         document: The document to work on. When omitted (or ``None``), a
             fresh empty ``Document`` is created.
     """
     # A `Document()` default in the signature is evaluated once, when the
     # function is defined, and would be shared by every instance created
     # without an argument; build a new one per instance instead.
     self.document = Document() if document is None else document
     print("Inside object of type : {}".format(self.__class__.__name__))