def sentence_segmentation(article, lang_code):
    """Split ``article`` into a list of sentence strings.

    Args:
        article: Text to segment.
        lang_code: ``"en"`` loads the ``en_core_web_sm`` model, ``"hi"``
            builds a ``Hindi`` language object.

    Returns:
        A list containing ``str(sentence)`` for every span in ``doc.sents``.

    Raises:
        ValueError: If ``lang_code`` is neither ``"en"`` nor ``"hi"``.
    """
    if lang_code == "en":
        nlp = en_core_web_sm.load()
    elif lang_code == "hi":
        nlp = Hindi()
    else:
        # Without this branch ``nlp`` stays unbound and the call below fails
        # with an opaque UnboundLocalError; report the real problem instead.
        raise ValueError(
            "Unsupported lang_code %r; expected 'en' or 'hi'" % (lang_code,)
        )

    # Append a sentencizer component to the pipeline before processing.
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp(article)
    return [str(sentence) for sentence in doc.sents]
def tokenize_hin(article):
    """Return the token texts produced by a fresh ``Hindi`` object for ``article``."""
    hindi_nlp = Hindi()
    return [tok.text for tok in hindi_nlp(article)]
Exemple #3
0
import time
import math
import pickle
import logging
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import sys

# NOTE(review): looks like an alternate project root from another machine,
# kept for reference — confirm before relying on it.
#/home/tushar.abhishek/nlpa/assignment-2
# Model identifier; used below to derive the log file name.
MODEL_NAME = 'attention-seq2seq(dot)'
# Hard-coded absolute directory in which the log file is created.
CACHE_DIR = "/home/tushar/Desktop/MS/sem 2/nlpa/assignment-2/saved_models/effective_attention"

# Smoothing helper from nltk.translate.bleu_score (imported above).
smoothie = SmoothingFunction()
# Module-level English and Hindi language objects.
# NOTE(review): presumably spaCy language classes; their import is not
# visible in this excerpt — confirm.
spacy_en, spacy_hi = English(), Hindi()
# Log file path: <CACHE_DIR>/<MODEL_NAME>.log
log_file = os.path.join(CACHE_DIR, "%s.log" % MODEL_NAME)
# Configure the root logger to write to the log file. filemode='w' opens the
# file for writing (an existing log is overwritten); records at DEBUG level
# and above are written, each prefixed with a timestamp.
logging.basicConfig(filename=os.path.abspath(log_file),
                    filemode='w',
                    level=logging.DEBUG,
                    format='%(asctime)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')
# Additionally send log records to standard output.
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))


class EncoderRNN(nn.Module):
    """Encoder module derived from ``nn.Module``.

    NOTE(review): only the beginning of the constructor is visible here. The
    constructor arguments are not used in the lines shown, so the remainder
    of the definition may be missing from this excerpt — confirm.
    """
    def __init__(self, input_dim, embedding_dim, hidden_dim, layer_count,
                 dropout_rate):
        # Initialise the nn.Module base class.
        super().__init__()
Exemple #4
0
def test_issue3625():
    """Check that the Hindi tokenizer splits punctuation off mixed Latin/Devanagari text."""
    tokenizer = Hindi()
    observed = [tok.text for tok in tokenizer("hi. how हुए. होटल, होटल")]
    assert observed == ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
Exemple #5
0
def test_issue3625():
    """Check that the Hindi tokenizer splits punctuation off mixed Latin/Devanagari text."""
    hindi = Hindi()
    parsed = hindi(u"hi. how हुए. होटल, होटल")
    token_texts = []
    for tok in parsed:
        token_texts.append(tok.text)
    wanted = ['hi', '.', 'how', 'हुए', '.', 'होटल', ',', 'होटल']
    assert token_texts == wanted