Code example #1
File: scripts.py Project: dumitrescustefan/ronec
def write_core_format_into_conllup_file(sentences, filepath):
    print(
        "Converting {} sentences into CONLLUP format. This requires a text preprocessor for Romanian. If the following function fails please install NLP-Cube (pip3 install nlpcube)."
        .format(len(sentences)))

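    # Two Romanian pipelines are prepared below: one that tokenizes raw text and
    # one with tokenization disabled, for input that is already split into tokens.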
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load("ro",
              tokenization=True,
              compound_word_expanding=False,
              tagging=True,
              lemmatization=True,
              parsing=True)
    cube_no_tok = Cube(verbose=True)
    cube_no_tok.load("ro",
                     tokenization=False,
                     compound_word_expanding=False,
                     tagging=True,
                     lemmatization=True,
                     parsing=True)

    conllupdataset = []
    for sentence in sentences:
        sentence = process_split_exceptions(sentence)
        conllupsentence = _conllup_to_core_sentence(sentence, cube,
                                                    cube_no_tok)
        conllupdataset.append(conllupsentence)

    write_file(filepath, conllupdataset)
Code example #2
def main(filename):
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('en')

    with open('words.txt') as f:
        word_list = [line.rstrip() for line in f.readlines()]
        word_set = set(word_list)

    with open('my_words.txt') as f:
        my_word_list = [line.rstrip() for line in f.readlines()]
        my_word_set = set(my_word_list)

    text = srt_to_text(filename)
    sentences = cube(text)
    new_words = []
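    # Collect lemmas that occur in the reference vocabulary (words.txt) but are
    # not yet in the personal list (my_words.txt), keeping first-seen order.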
    for sentence in sentences:
        for entry in sentence:
            if entry.lemma in word_set and entry.lemma not in my_word_list:
                if entry.lemma not in new_words:
                    new_words.append(entry.lemma)
    print('-' * 100)
    print(f'{len(new_words)} new words are found.')
    print('-' * 100)
    for i, word in enumerate(new_words):
        print(i, word)
Code example #3
 def test_4_3_run_model_with_default_external_embeddings(self):  
     print("\n\33[33m{}\33[0m".format("4.3. Run a local model with default external embeddings ..."))                        
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)       
Code example #4
 def test_2_run_a_local_model(self):  
     print("\n\33[33m{}\33[0m".format("2. Run a local model that does not have embeddings or metadata (running with dummy.vec embeddings) ..."))
     embeddings = os.path.join(self.root_path, "examples","wiki.dummy.vec")
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True, local_models_repository=self.local_model_repo, local_embeddings_file=embeddings)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)   
Code example #5
 def test_1_2_download_and_run_an_online_model_specific_version(self):                                    
     print("\n\33[33m{}\33[0m".format("1.2. Loading an online model (sme, 1.0) ..."))
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('sme', version='1.0', tokenization=True, compound_word_expanding=False, tagging=False, lemmatization=False, parsing=False)
     cube.metadata.info()
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)        
Code example #6
def prepare_dialogs_sorted_by_lang(dialog_ids,
                                   dialog_path,
                                   prepared_path,
                                   start_date,
                                   end_date,
                                   additional_options=""):
    dialog_ids_sorted_by_lang = {"ua": [], "ru": [], "en": []}
    if dialog_ids[0] == -1:
        for filename in os.listdir(dialog_path):
            data = pd.read_csv(f"{dialog_path}/{filename}")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(filename[:-4])

    else:
        for dialog in dialog_ids:
            data = pd.read_csv(f"{dialog_path}/{dialog}.csv")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(dialog)

    print("dialog_ids_sorted_by_lang")
    pprint(dialog_ids_sorted_by_lang)

    n_all_dialogs = sum([
        len(dialog_ids_sorted_by_lang[lang])
        for lang in dialog_ids_sorted_by_lang.keys()
    ])
    n_dialog = 0
    for lang in dialog_ids_sorted_by_lang.keys():
        if not dialog_ids_sorted_by_lang[lang]:
            continue

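        # A Cube pipeline is loaded only for Ukrainian ("ua" -> model "uk") and
        # English; for Russian the placeholder empty string is passed through.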
        cube = ""
        if lang == "ua":
            cube = Cube(verbose=True)
            cube.load("uk")

        elif lang == "en":
            cube = Cube(verbose=True)
            cube.load("en")

        for dialog_id in dialog_ids_sorted_by_lang[lang]:
            if f"{dialog_id}.csv" in os.listdir(prepared_path):
                print(
                    f"=========WARNING: {dialog_id}.csv already in {prepared_path}"
                )
                n_dialog += 1
                continue

            n_dialog += 1
            print(
                f"\n=======Language {lang}, dialog_id {dialog_id}-- {n_dialog} from {n_all_dialogs}======="
            )
            prepare_dialogs(lang, cube, dialog_id, prepared_path, dialog_path,
                            start_date, end_date, "words_frequency",
                            additional_options)
Code example #7
 def test_3_3_run_model_with_manual_embeddings(self):  
     print("\n\33[33m{}\33[0m".format("3.3. Run a local model with manual embeddings ..."))                
     embeddings = os.path.join(self.root_path, "examples","wiki.dummy.vec")
     print("\t\tPath to local manual embeddings file: "+embeddings)
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True, local_embeddings_file=embeddings)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)   
Code example #8
def create_conll_sentences(file_path):
    print("*"*25 + "  Working on transforming the input file '{}' to CoNLL format  ".format(file_path) + "*"*25 + "\n")

    print("Reading the input file...")
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    print("Loading the 'ro' nlp-cube model...")

    cube = Cube(verbose=False)
    cube.load("ro")

    print("Creating the CoNLL sentences...")
    sentences = cube(text)

    print("\n" + "*"*124 + "\n")

    return sentences
Code example #9
def get_lemmatized_vocabulary(unlemmatized_voc, epi, lang):
    if os.path.isfile('lemmatized_' + lang + '.p'):
        voc = p.load(open('lemmatized_' + lang + '.p', 'rb'))
        return voc

    # Lemmatizer
    lang_acron = {
        'gothic': 'got',
        'latin': 'la',
        'italian': 'it',
        'german': 'de',
        'greek': 'grc',
        'english': 'eng'
    }
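    # Map the full language name to the short model code that NLP-Cube expects
    # before loading the lemmatization model.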
    cube = Cube(verbose=False)
    cube.load(lang_acron[lang])

    voc = {}
    keys = list(unlemmatized_voc.keys())
    for i in tqdm(range(len(keys))):
        w = keys[i]
        sents = cube(w)
        if type(sents[0]) == list:
            for sent in sents:
                for token in sent:
                    if token.lemma != '_':
                        voc[w] = [
                            token.lemma,
                            epi.transliterate(token.lemma), 'L'
                        ]
                    else:
                        voc[w] = [
                            token.word,
                            epi.transliterate(token.word), 'T'
                        ]
        else:
            for token in sents:
                if token.lemma != '_':
                    voc[w] = [token.lemma, epi.transliterate(token.lemma), 'L']
                else:
                    voc[w] = [token.word, epi.transliterate(token.word), 'T']
    p.dump(voc, open('lemmatized_' + lang + '.p', 'wb'))
    return voc
Code example #10
 def download_model(self):
     if self.lib.lower() == "stanford":
         print("-----------You are going to use Stanford library-----------")
         if self.lang.lower() == "basque":
             print("-------------You are going to use Basque model-------------")
             # MODELS_DIR = '/home/edercarbajo/eu'
             MODELS_DIR = 'J:\TextSimilarity\eu'
             stanfordnlp.download('eu', MODELS_DIR)  # Download the Basque models
             # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
             #           'lang': 'eu',  # Language code for the language to build the Pipeline in
             #           'tokenize_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
             #           # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
             #           'pos_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tagger.pt',
             #           'pos_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt',
             #           'lemma_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
             #           'depparse_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_parser.pt',
             #           'depparse_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt'
             #           }
             config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                       'lang': 'eu',  # Language code for the language to build the Pipeline in
                       'tokenize_model_path': 'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                       # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                       'pos_model_path': 'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                       'pos_pretrain_path': 'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                       'lemma_model_path': 'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                       'depparse_model_path': 'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                       'depparse_pretrain_path': 'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
                       }
             self.parser = stanfordnlp.Pipeline(**config)
         else:
             print("............Working...........")
     elif self.lib.lower() == "cube":
         print("-----------You are going to use Cube Library-----------")
         if self.lang.lower() == "basque":
             cube = Cube(verbose=True)
             cube.load("eu", "latest")
             self.parser = cube
         else:
             print("............Working...........")
     else:
         print("You cannot use this library. Introduce a valid library (Cube or Stanford)")
Code example #11
def start():
    from analyzer import Analyzer
    p = ArgumentParser(description="python3 ./main.py -f \"laginak/*.doc.txt\" ")
    optional = p._action_groups.pop()  # Edited this line
    required = p.add_argument_group('Required arguments')
    required.add_argument("-f", "--files", nargs='+', help="Files to analyze (in .txt, .odt, .doc or .docx format)")
    optional.add_argument('-a', '--all', action='store_true', help="Generate a CSV file with all the results")
    optional.add_argument('-s', '--similarity', action='store_true', help="Calculate similarity (max. 5 files)")
    p._action_groups.append(optional)
    opts = p.parse_args()
    FileLoader.load_files(opts.files)
    FileLoader.load_irregular_verbs_list()
    FileLoader.load_dale_chall_list()
    FileLoader.load_connectives_list()
    FileLoader.load_oxford_word_list()
    cube = Cube(verbose=True)
    # Load the Cube model
    cube.load("en", "latest")
    df_row = None
    ### Files will be created in this folder
    path = Printer.create_directory(FileLoader.files[0])
    file_num = 0
    total = len(FileLoader.files)
    for input in FileLoader.files:
        texto = Analyzer.process_text(input=input)
        # Analyze the text
        a = Analyzer(texto, input, cube)
        i = a.analyze(opts.similarity)
        df = a.create_dataframe()
        prediction = a.predict_dificulty(df)
        file_num += 1
        p = Printer(input, i)
        p.print_info(opts.similarity, prediction, file_num, total)
        if opts.all:
            df_row = p.write_in_full_csv(df_row, opts.similarity)
        p.generate_csv(path, prediction, opts.similarity)
    if opts.all:
        df_row.to_csv(os.path.join(path, "full_results_aztertest.csv"), encoding='utf-8', index=False)
Code example #12
 def download_model(self):
     if self.lib.lower() == "stanford":
         print(
             "-----------You are going to use Stanford library-----------")
         if self.lang.lower() == "basque":
             print(
                 "-------------You are going to use Basque model-------------"
             )
             # MODELS_DIR = '/home/kepa/eu'
             MODELS_DIR = 'J:\TextSimilarity\eu'
             stanfordnlp.download('eu',
                                  MODELS_DIR)  # Download the Basque models
         elif self.lang.lower() == "english":
             print(
                 "-------------You are going to use English model-------------"
             )
             MODELS_DIR = '/home/kepa/en'
             print(
                 "-------------Downloading Stanford Basque model-------------"
             )
             stanfordnlp.download('en',
                                  MODELS_DIR)  # Download the Basque models
         elif self.lang.lower() == "spanish":
             print(
                 "-------------You are going to use Spanish model-------------"
             )
             MODELS_DIR = '/home/kepa/es'
             stanfordnlp.download('es',
                                  MODELS_DIR)  # Download the English models
         else:
             print("........You cannot use this language...........")
     elif self.lib.lower() == "cube":
         print("-----------You are going to use Cube Library-----------")
         if self.lang.lower() == "basque":
             cube = Cube(verbose=True)
             cube.load("eu", "latest")
         elif self.lang.lower() == "english":
             cube = Cube(verbose=True)
             cube.load("en", "latest")
         elif self.lang.lower() == "spanish":
             cube = Cube(verbose=True)
             cube.load("es", "latest")
         else:
             print("........You cannot use this language...........")
     else:
         print(
             "You cannot use this library. Introduce a valid library (Cube or Stanford)"
         )
Code example #13
class CubeNLP(TeproApi):
    """By Tibi Boroș & co., does sentence splitting, tokenization,
    POS tagging, lemmatization and dependency parsing for Romanian."""
    def __init__(self):
        super().__init__()
        self._algoName = TeproAlgo.algoCube

    @staticmethod
    def sgml2unicode(word: str) -> str:
        word = word.replace("ă", "ă")
        word = word.replace("Ă", "Ă")
        word = word.replace("â", "â")
        word = word.replace("Â", "Â")
        word = word.replace("î", "î")
        word = word.replace("Î", "Î")
        word = word.replace("ş", "ș")
        word = word.replace("Ş", "Ș")
        word = word.replace("ţ", "ț")
        word = word.replace("Ţ", "Ț")

        return word

    @staticmethod
    def _readMSDMappings():
        m2c = {}

        with open(CTAG2MSDMAPFILE, mode="r") as f:
            for line in f:
                line = line.strip()
                parts = line.split()

                if len(parts) == 2:
                    msd = parts[0]
                    ctg = parts[1]
                    m2c[msd] = ctg
                # end if
            # end for line
        # end open file
        return m2c

    @staticmethod
    def _readTblWordForm():
        tbl = {}
        counter = 0

        with open(TBLWORDFORMFILE, mode="r", encoding="utf-8") as f:
            for line in f:
                counter += 1

                if counter > 0 and counter % 100000 == 0:
                    print("{0}.{1}[{2}]: loading tbl.wordform.ro, at line {3}".
                          format(
                              Path(inspect.stack()[0].filename).stem,
                              inspect.stack()[0].function,
                              inspect.stack()[0].lineno, counter),
                          file=sys.stderr,
                          flush=True)

                line = line.strip()

                if line.startswith("#"):
                    continue

                parts = line.split()

                if len(parts) == 3:
                    word = CubeNLP.sgml2unicode(parts[0])
                    lemma = CubeNLP.sgml2unicode(parts[1])

                    if lemma == '=':
                        lemma = word

                    msd = parts[2]

                    if word not in tbl:
                        tbl[word] = {}

                    if msd not in tbl[word]:
                        tbl[word][msd] = []

                    tbl[word][msd].append(lemma)
                # end if parts has 3 elems
            # end for line in f
        # end while open file
        return tbl

    def createApp(self):
        self._cubeInst = Cube(verbose=True)

    def loadResources(self):
        self._cubeInst.load('ro',
                            tokenization=True,
                            compound_word_expanding=False,
                            tagging=True,
                            lemmatization=True,
                            parsing=True)
        self._tblwordform = CubeNLP._readTblWordForm()
        self._msd2ctag = CubeNLP._readMSDMappings()

    def _runApp(self, dto, opNotDone):
        text = dto.getText()
        sentences = self._cubeInst(text)
        sid = 0

        for sent in sentences:
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = ""

            for tok in sent:
                tt = TeproTok()
                tt.setId(tok.index)
                tt.setWordForm(tok.word)
                lowerWord = tok.word.lower()
                tt.setMSD(tok.xpos)

                # Assigning the mapped CTAG to the disambiguated MSD
                if tok.xpos in self._msd2ctag:
                    tt.setCTAG(self._msd2ctag[tok.xpos])
                else:
                    tt.setCTAG(tok.xpos)

                lemmaIsSet = False

                # Doing lexicon lemmatization, if possible.
                if tok.word in self._tblwordform:
                    if tok.xpos in self._tblwordform[tok.word] and \
                            len(self._tblwordform[tok.word][tok.xpos]) == 1:
                        # TODO: if lemma is ambiguous, e.g. 'copii' can be 'copil' or 'copie'
                        tt.setLemma(self._tblwordform[tok.word][tok.xpos][0])
                        lemmaIsSet = True
                elif lowerWord in self._tblwordform and \
                        tok.xpos in self._tblwordform[lowerWord] and \
                        len(self._tblwordform[lowerWord][tok.xpos]) == 1:
                    tt.setLemma(self._tblwordform[lowerWord][tok.xpos][0])
                    lemmaIsSet = True

                if not lemmaIsSet:
                    tt.setLemma(tok.lemma)

                tt.setHead(tok.head)
                tt.setDepRel(tok.label)

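                # Rebuild the detokenized sentence string, inserting a space
                # unless the token carries the SpaceAfter=No flag.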
                tssent += tok.word

                if tok.space_after != "SpaceAfter=No":
                    tssent += " "

                ttsent.append(tt)
            # end ttsent/tssent formation

            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only NLPCube
                # can produce or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)

                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)

            sid += 1

        return dto
Code example #14
# for parsing files from the db / books

import os
import json
import fnmatch
from input_parser import input_parser
from nltk import tokenize
from textwrap import wrap

from cube.api import Cube
cube = Cube(verbose=True)
cube.load('ro')

booksDir = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../../DB')))
dataDir = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../data')))

books = []

for filename in os.listdir(booksDir):
    if filename.endswith('.txt') and fnmatch.fnmatch(filename, "2???_a_*"):
        books.append(filename)
    if len(books) > 100:
        break

for filename in books:
    book_content = open(os.path.join(booksDir, filename),
                        encoding="utf-8").read()

    if os.path.exists(
Code example #15
 print("Found {} local models".format(len(local_models)))
 
 
 model_count = len(online_models)
 
 # step 1. download all models
 for online_model in online_models:        
     model, version = online_model[0], online_model[1]
     if online_model not in local_models:
         print("Downloading {}-{}".format(model,version))
     else:
         print("Model {}-{} is already downloaded.".format(model,version))
         continue
     cube = Cube()
     #cube.load(model, version)???
     cube.load(model)
  
 print("\n\n")
 for online_model in online_models:
     model, version = online_model[0], online_model[1]
     print("\n\nTesting model {}-{}, @{}".format(model,version, datetime.today()))
     if model == "pl":
         continue
     
     # go run Cube
     print("\t Reading metadata ...")        
     metadata = ModelMetadata()        
     metadata.read(os.path.join(local_model_path,model+"-"+str(version),"metadata.json"))
     
     
     mlanguage = metadata.language
Code example #16
File: ignore.py Project: dumitrescustefan/ronec
    's', 'x', 's', 's', 's', 's', 's', 's', 'm', 's', 's', 's', 's', 'm', 's',
    's', 's', 'm', 'm', 's', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 'm',
    'm', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's', 'm', 's', 's', 'm',
    's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's',
    's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 's',
    's', 's', 's', 's', 's', 'm', 's', 's', 'm', 's', 's', 's', 's', 's', 's',
    'm', 's', 'm', 's', 's', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's',
    's', 'm'
]

from cube.api import Cube

cube = Cube(verbose=True)
cube.load("ro",
          tokenization=True,
          compound_word_expanding=False,
          tagging=True,
          lemmatization=True,
          parsing=True)
cube_no_tok = Cube(verbose=True)
cube_no_tok.load("ro",
                 tokenization=False,
                 compound_word_expanding=False,
                 tagging=True,
                 lemmatization=True,
                 parsing=True)
multi_sentence_count = 0
errors = 0
multi_sentences = []
conllupdataset = []
for sentence in sentences:
    if "Alege: [" in sentence.sentence or "Decide tipul/clasa corecta: [" in sentence.sentence or len(
Code example #17
File: main.py Project: raduenuca/text-summarization
def cli(ctx, language):
    cube = Cube(verbose=True)
    cube.load(language)

    ctx.ensure_object(dict)
    ctx.obj['CUBE'] = cube
Code example #18
def map_bibles(f1,
               f2s,
               voc,
               l1='gothic',
               cube1=True,
               cube2=False,
               lemmatizer={},
               expand_voc=False):
    f1_dict = load_bible(open(f1, 'r'))

    lemma_l1 = False if cube1 == False else True

    count = 0
    mapped = {}

    all_words_count = 0
    found_words = {}
    unfound_words = {}

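    # Walk the book -> chapter -> verse structure of the first Bible, storing the
    # raw verse, its analysis and, for Gothic, per-language lemma translations.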
    for book in f1_dict:
        if book not in mapped:
            mapped[book] = {}
        for chapter in f1_dict[book]:
            if chapter not in mapped[book]:
                mapped[book][chapter] = {}
            for verse in f1_dict[book][chapter]:
                if verse not in mapped[book][chapter]:
                    mapped[book][chapter][verse] = {}

                count += 1
                if count % 500 == 0:
                    print(count)

                mapped[book][chapter][verse][l1] = f1_dict[book][chapter][
                    verse]
                if lemma_l1:
                    mapped[book][chapter][verse][l1 + '_analyzed'] = analyze(
                        f1_dict[book][chapter][verse], cube1, lang='got')
                else:
                    mapped[book][chapter][verse][
                        l1 + '_analyzed'] = fake_analyze(
                            f1_dict[book][chapter][verse],
                            lang='got',
                            lemmatizer=lemmatizer)

                if l1 == 'got':
                    mapped[book][chapter][verse][l1 + '_translation'] = {}
                    lemmas = list(
                        list(
                            zip(*mapped[book][chapter][verse][
                                l1 + '_analyzed']))[2])

                    for lang in all_languages:
                        if lang not in found_words:
                            found_words[lang] = {}
                        if lang not in unfound_words:
                            unfound_words[lang] = {}

                        if lang == 'Got':
                            mapped[book][chapter][verse][
                                l1 + '_translation'][lang] = lemmas
                            mapped[book][chapter][verse][l1 + '_translation'][
                                lang + '_script'] = [
                                    gothic_script_transformer(t)
                                    for t in lemmas
                                ]
                            mapped[book][chapter][verse][l1 + '_translation'][
                                lang + '_ipa'] = [
                                    ipa_transformer(t, 'gothic')
                                    for t in lemmas
                                ]
                        else:
                            if lang not in mapped[book][chapter][verse][
                                    l1 + '_translation']:
                                mapped[book][chapter][verse][
                                    l1 + '_translation'][lang] = []

                            for word in lemmas:
                                all_words_count += 1
                                if word in voc and lang in voc[word]:
                                    #if lang == 'Lat':
                                    #	print(word, voc[word][lang])

                                    if word not in found_words[lang]:
                                        found_words[lang][word] = 0
                                    found_words[lang][word] += 1
                                    mapped[book][chapter][verse][
                                        l1 + '_translation'][lang].append(
                                            voc[word][lang])
                                else:
                                    if word not in unfound_words[lang]:
                                        unfound_words[lang][word] = 0
                                    unfound_words[lang][word] += 1
                                    mapped[book][chapter][verse][
                                        l1 + '_translation'][lang].append([])

    for lang in found_words:
        found = sum([found_words[lang][w] for w in found_words[lang]])
        unfound = sum([unfound_words[lang][w] for w in unfound_words[lang]])
        print('token', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
        found = len(found_words[lang])
        unfound = len(unfound_words[lang])
        print('\ttype', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
    print('All words: {}'.format(all_words_count))

    for f in f2s:
        l2 = f.split('/')[0]

        langs_epitran = {
            'german': 'deu-Latn',
            'italian': 'ita-Latn',
            'latin': 'ita-Latn',
            'spanish': 'spa-Latn',
            'english': 'eng-Latn'
        }
        if l2 in langs_epitran:
            epi = epitran.Epitran(langs_epitran[l2])

        if cube2:
            lemma_l2 = lemma_l1
            cube2 = Cube(verbose=False)
            cube2.load(lang_acron[l2])

        #if f == 'greek/greek_byzantine_2000_utf8.txt':
        #	pdb.set_trace()
        f2_dict = load_bible(open(f, 'r'))
        lemma_l2 = False if cube1 == False else True

        for book in mapped:
            for chapter in mapped[book]:
                for verse in mapped[book][chapter]:
                    if book not in f2_dict or chapter not in f2_dict[
                            book] or verse not in f2_dict[book][chapter]:
                        #pdb.set_trace()
                        continue

                    if f2_dict[book][chapter][verse] in ['', '[]', []]:
                        continue
                    #print(f2_dict[book][chapter][verse])

                    #mapped_words = {'german':'deu-Latn', 'italian':'ita-Latn', 'latin':'ita-Latn', 'spanish':'Es', 'greek':'Gre'}
                    if lemma_l2:
                        mapped[book][chapter][verse][
                            l2 + '_analyzed'] = analyze(
                                f2_dict[book][chapter][verse], cube2, lang=l2)
                    else:
                        mapped[book][chapter][verse][
                            l2 + '_analyzed'] = fake_analyze(
                                f2_dict[book][chapter][verse], lang=l2)

                    # IPA
                    #try:
                    lemmas = [
                        x for x in (list(
                            zip(*mapped[book][chapter][verse][l2 +
                                                              '_analyzed']))[2]
                                    )
                    ]
                    #except:
                    #pdb.set_trace()
                    mapped[book][chapter][verse][l2] = lemmas
                    if l2 == 'greek':
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            ipa_transformer(l, 'greek') for l in lemmas
                        ]  #.split()
                    elif l2 == 'old_english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas
                            ]  #.split()
                        except:
                            pass
                    elif l2 == 'english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas
                            ]  #.split() #gen_mods.get_final(gen_mods.getIPA_CMU(f2_dict[book][chapter][verse]))
                        except:
                            pass
                    else:
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            epi.transliterate(l) for l in lemmas
                        ]  #.split()

                    expand_voc = False
                    if l2 != 'Got' and expand_voc:
                        expand_voc_by_distance(voc, mapped, book, chapter,
                                               verse, l2)
                        #pdb.set_trace()

    return mapped
Code example #19
File: CubeNLPInst.py Project: mcyph/pos_tagger
            for entry in LSentence:
                print(entry)
            print("")


if __name__ == '__main__':
    if True:
        found_nl = False
        for iso in CubeNLPPOS.get_L_supported_isos(None):
            if iso not in ('nno', 'nnb'):
                continue

            from cube.api import Cube  # import the Cube object

            cube = Cube(verbose=True)  # initialize it
            cube.load(DSupportedISOs[iso])

    print_pos(
        'id',
        'Tahap pertama konflik ini dapat disebut "Perang Kemerdekaan Belanda".'
    )
    print_pos(
        'en',
        'The first phase of the conflict can be considered the Dutch War of Independence.'
    )

    print_pos('id', 'Saya tidak dapat memakan ini.')
    print_pos('en', 'I can\'t eat this.')

    print_pos('zh', '猴子高兴,实验人员也高兴。')
    print_pos('en', 'The monkeys were happy and the experimenters were happy.')
Code example #20
File: tasks.py Project: Luiscri/MABSED
# filter
import os
import sys
import csv
import json
import datetime

import luigi

from filter.filter import filter_spam

# lemmatizer
from lemmatizer.lemmatizer import lemmatize
from cube.api import Cube

# detection
sys.path.insert(0, './detector/')
from detect_events import main as detect_events

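# The Spanish pipeline is loaded once at import time; tokenization and parsing
# are switched off, leaving tagging and lemmatization enabled.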
lemmatizer = Cube(verbose=True)
lemmatizer.load("es", tokenization=False, parsing=False)


class Streamer(luigi.ExternalTask):
    time_slice = luigi.parameter.DateMinuteParameter(interval=30)

    def output(self):
        fname = '../data/streaming/{}.csv'.format(self.time_slice)
        # print('Requires: {}'.format(fname))
        return luigi.LocalTarget(fname)


class Preprocess(luigi.Task):
    time_slice = luigi.parameter.DateMinuteParameter(
        interval=30, default=datetime.datetime.today())
Code example #21
from cube.api import Cube

cube = Cube(verbose=True)

cube.load("en", local_models_repository="/mnt/d/nlpcube/")

text = "One potential microRNA that regulates Bcan is miR-9 and overexpression of miR-9 can partly rescue the effects of Dicer1 deletion on the MG phenotype."

sentences = cube(text)

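# Print one token per line in a tab-separated, CoNLL-U-like layout:
# index, word, lemma, UPOS, XPOS, morphological attributes, head, relation, spacing.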
for sentence in sentences:
    for entry in sentence:
        print(
            str(entry.index) + "\t" + entry.word + "\t" + entry.lemma + "\t" +
            entry.upos + "\t" + entry.xpos + "\t" + entry.attrs + "\t" +
            str(entry.head) + "\t" + str(entry.label) + "\t" +
            entry.space_after)
    print("")
Code example #22
import tnkeeh as tn
from farasa.segmenter import FarasaSegmenter

# for bulgarian and turkish
from cube.api import Cube

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()
# TODO: change hardcoding of jar file to a arg from cli
rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar",
                         annotators="wseg",
                         max_heap_size="-Xmx500m")
ar_segmenter = FarasaSegmenter()

bg_cube = Cube(verbose=False)
bg_cube.load("bg")

tr_cube = Cube(verbose=False)
tr_cube.load("tr")


def clean_ar_text(
    text,
    segment=False,
    remove_special_chars=False,
    remove_english=False,
    normalize=False,
    remove_diacritics=False,
    excluded_chars=[],
    remove_tatweel=False,
    remove_html_elements=False,
Code example #23
    local_models = model_store_object.list_local_models()
    print("Found {} local models".format(len(local_models)))

    model_count = len(online_models)

    # step 1. download all models
    for online_model in online_models:
        model, version = online_model[0], online_model[1]
        if online_model not in local_models:
            print("Downloading {}-{}".format(model, version))
        else:
            print("Model {}-{} is already downloaded.".format(model, version))
            continue
        cube = Cube()
        cube.load(model, version, local_models_repository=local_model_path)
        #cube.load(model)

    print("\n\n")
    #for online_model in local_models: #local_models+online_models:
    for online_model in local_models + online_models:
        model, version = online_model[0], online_model[1]
        print("\n\nTesting model {}-{}, @{}".format(model, version,
                                                    datetime.today()))
        if model == "pl":
            continue

        # go run Cube
        print("\t Reading metadata ...")
        metadata = ModelMetadata()
        metadata.read(
Code example #24
from cube.api import Cube

cube = Cube(verbose=True)
cube.load(
    'ro', tokenization=True,
    compound_word_expanding=False,
    tagging=True,
    lemmatization=True,
    parsing=True
)
Code example #25
import pandas as pd
import os
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
import numpy as np
import nltk
import string
from nltk.stem import PorterStemmer, SnowballStemmer

from cube.api import Cube
cube = Cube(verbose=True)
cube.load("ro")

# import the dataset
df = pd.read_csv('pcgarage.csv', delimiter='\t', encoding='utf-16', header=0)

# preprocessing
# removing unwanted elements
df['pro'] = df['pro'].str[4:]
df['contra'] = df['contra'].str[8:]
df['altele'] = df['altele'].str[8:]
# concatenating the parts
df["corpus"] = df["pro"].astype(str) + " " + df["contra"].astype(
    str) + " " + df["altele"].astype(str)
data = df[['product', 'rating', 'corpus']].copy()
data['corpus'] = [it.lower().replace('\n\n', ' ') for it in data['corpus']]
# lowercasing the text
data['corpus'] = data.corpus.map(lambda x: x.lower())
# tokenization
Code example #26
File: text_tokenize.py Project: cephcyn/PICO_Parser
# implement NLP-Cube https://github.com/adobe/NLP-Cube for sent and word tokenization, and lemma
from cube.api import Cube  # import the Cube object
cube = Cube(verbose=False)  # initialize it
cube.load("en")
from rusenttokenize import ru_sent_tokenize

import re


class mytokenizer:
    def bracket_mask(self, text):

        text_temp = re.sub("\[", "<<", text)
        text_temp = re.sub("\]", ">>", text_temp)
        text_temp = re.sub("\+", "===", text_temp)
        parenthesis = re.findall("\([^()]*\)", text_temp)
        bracket = re.findall("\[[^\[\]]*\]", text)
        if not parenthesis and not bracket:
            return text

        for p in parenthesis:
            p = re.sub("[\(\)]", "", p)
            parts = [re.sub("\.\s*$", "", s) for s in ru_sent_tokenize(p)]
            new_p = " ; ".join(parts)
            #print ("P:", p)
            #print("new_p:",new_p,"\n")
            #print("text_temp:",text_temp)
            text_temp = re.sub(p, new_p, text_temp)

        text = re.sub("<<", "[", text_temp)
        text = re.sub(">>", "]", text)
Code example #27
####################################################################################
if recompute_histograms or not os.path.isfile(histogram_picklefile_gf %
                                              (category, FREQUENCY_THRESH)):
    if category == "All":
        D_coords_fixated, D_histogram, D_entropy, D_entropy_df = \
         merge_histograms(histogram_picklefile_gf)
        imnames = list(set(D_coords_fixated.keys()))
        pickle.dump([D_coords_fixated, D_histogram, D_entropy, D_entropy_df],
                    open(
                        histogram_picklefile_gf % (category, FREQUENCY_THRESH),
                        "wb"))
    else:
        D_coords_fixated = get_raw_data()
        imnames = list(set(D_coords_fixated.keys()))
        cube = Cube(verbose=True)
        cube.load('en')
        start = time.time()
        D_histogram, D_entropy, D_entropy_df = compute_histograms(
            D_coords_fixated,
            imnames,
            category,
            is_grouping=True,
            fre_threshold=FREQUENCY_THRESH)
        print(time.time() - start)
        pickle.dump([D_coords_fixated, D_histogram, D_entropy, D_entropy_df],
                    open(
                        histogram_picklefile_gf % (category, FREQUENCY_THRESH),
                        "wb"))
        print(histogram_picklefile_gf % (category, FREQUENCY_THRESH))
else:
    with open(histogram_picklefile_gf % (category, FREQUENCY_THRESH),
Code example #28
    def load_model(self):
        if self.lib.lower() == "stanford":
            print(
                "-----------You are going to use Stanford library-----------")
            if self.lang.lower() == "basque":
                print(
                    "-------------You are going to use Basque model-------------"
                )
                # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                #            'lang': 'eu',  # Language code for the language to build the Pipeline in
                #            'tokenize_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
                #            # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                #            'pos_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_tagger.pt',
                #            'pos_pretrain_path': '/home/kepa/eu/eu_bdt_models/eu_bdt.pretrain.pt',
                #            'lemma_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
                #            'depparse_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_parser.pt',
                #            'depparse_pretrain_path': '/home/kepa/eu/eu_bdt_models/eu_bdt.pretrain.pt'
                #            }
                config = {
                    'processors':
                    'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang':
                    'eu',  # Language code for the language to build the Pipeline in
                    'tokenize_model_path':
                    'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                    # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    'pos_model_path':
                    'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                    'pos_pretrain_path':
                    'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                    'lemma_model_path':
                    'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                    'depparse_model_path':
                    'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                    'depparse_pretrain_path':
                    'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)

            elif self.lang.lower() == "english":
                print(
                    "-------------You are going to use English model-------------"
                )
                config = {
                    'processors':
                    'tokenize,mwt,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang':
                    'en',  # Language code for the language to build the Pipeline in
                    'tokenize_model_path':
                    '/home/kepa/en/en_ewt_models/en_ewt_tokenizer.pt',  # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    #'mwt_model_path': './fr_gsd_models/fr_gsd_mwt_expander.pt',
                    'pos_model_path':
                    '/home/kepa/en/en_ewt_models/en_ewt_tagger.pt',
                    'pos_pretrain_path':
                    '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt',
                    'lemma_model_path':
                    '/home/kepa/en/en_ewt_models/en_ewt_lemmatizer.pt',
                    'depparse_model_path':
                    '/home/kepa/en/en_ewt_models/en_ewt_parser.pt',
                    'depparse_pretrain_path':
                    '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)
            elif self.lang.lower() == "spanish":
                print(
                    "-------------You are going to use Spanish model-------------"
                )
                config = {
                    'processors':
                    'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang':
                    'es',  # Language code for the language to build the Pipeline in
                    'tokenize_model_path':
                    '/home/kepa/es/es_ancora_models/es_ancora_tokenizer.pt',  # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    'pos_model_path':
                    '/home/kepa/es/es_ancora_models/es_ancora_tagger.pt',
                    'pos_pretrain_path':
                    '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt',
                    'lemma_model_path':
                    '/home/kepa/es/es_ancora_models/es_ancora_lemmatizer.pt',
                    'depparse_model_path':
                    '/home/kepa/es/es_ancora_models/es_ancora_parser.pt',
                    'depparse_pretrain_path':
                    '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)
            else:
                print("........You cannot use this language...........")
        elif self.lib.lower() == "cube":
            print("-----------You are going to use Cube Library-----------")
            if self.lang.lower() == "basque":
                #load(self, language_code, version="latest",local_models_repository=None,
                #local_embeddings_file=None, tokenization=True, compound_word_expanding=False,
                #tagging=True, lemmatization=True, parsing=True).
                # Example: load("es", tokenization=False, parsing=False)
                cube = Cube(verbose=True)
                cube.load("eu", "latest")
            elif self.lang.lower() == "english":
                cube = Cube(verbose=True)
                cube.load("en", "latest")
            elif self.lang.lower() == "spanish":
                cube = Cube(verbose=True)
                cube.load("es", "latest")
            else:
                print("........You cannot use this language...........")
        else:
            print(
                "You cannot use this library. Introduce a valid library (Cube or Stanford)"
            )
Code example #29
from typing import List, Dict
from urllib.parse import urlparse, parse_qs
import http.server
import json

from abbrev import full_to_abbrev
from conllu_msd_to_monomial import MSD_dict
from msd_convert import UPOS_to_MSD, MSD_to_attribs

hostName = "localhost"
serverPort = 8080
QUERY = 'q'

from cube.api import Cube

ro_cube = Cube(verbose=True)  # initialize it
ro_cube.load("ro")  # select the desired language (it will auto-download the model on first run)

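# The Romanian pipeline is created once at module level, before the HTTP request
# handler class is defined, so it is not reloaded for each request.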

class POSTagRequestHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        parse = urlparse(self.path)
        query = parse_qs(parse.query)
        query = {k:' '.join(query[k]) for k in query.keys()}
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        # self.wfile.write(bytes("<html><head><title>https://pythonbasics.org</title></head>", "utf-8"))
        self.wfile.write(bytes(self._process_input(query[QUERY]), "utf-8"))
        # self.wfile.write(bytes("</body></html>", "utf-8"))

    def end_headers(self):
Code example #30
import pickle

from cube.api import Cube


def create_pickle():
    cube = Cube(verbose=True)
    cube.load("ja")
    with open('cube.pickle', mode='wb') as wh:
        pickle.dump(cube, wh)
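
As a companion to Code example #30, here is a minimal sketch (not taken from any of the projects above) for restoring the pickled pipeline; the function name load_pickle and the default path are illustrative and assume the file written by create_pickle:

import pickle


def load_pickle(path='cube.pickle'):
    # Restore the Cube pipeline object that create_pickle() serialized.
    with open(path, mode='rb') as rh:
        return pickle.load(rh)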