import en_core_web_sm
from spacy.lang.hi import Hindi


def sentence_segmentation(article, lang_code):
    """Split an article into sentence strings for English ("en") or Hindi ("hi")."""
    if lang_code == "en":
        # The English model ships with a parser, so doc.sents works out of the box.
        nlp = en_core_web_sm.load()
    if lang_code == "hi":
        # The blank Hindi pipeline needs an explicit sentencizer for doc.sents
        # (spaCy 2.x API: create_pipe + add_pipe).
        nlp = Hindi()
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp(article)
    sentences = [str(sent) for sent in doc.sents]
    return sentences
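# Minimal usage sketch (not part of the original snippet): assumes spaCy 2.x and
# the en_core_web_sm model are installed; the sample article string is made up.
sample_article = "spaCy splits text into sentences. Each sentence is returned as a string."
print(sentence_segmentation(sample_article, "en"))
# Expected result: the two sentences returned as separate strings.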
from spacy.lang.hi import Hindi


def tokenize_hin(article):
    """Tokenize a Hindi article with spaCy's blank Hindi pipeline."""
    nlp = Hindi()
    doc = nlp(article)
    tokens = [token.text for token in doc]
    return tokens
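# Usage sketch (not part of the original snippet); the Hindi sentence is illustrative.
print(tokenize_hin("यह एक छोटा वाक्य है।"))
# Expected: a list of surface tokens, with punctuation split off per spaCy's default rules.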
import os
import sys
import time
import math
import pickle
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from spacy.lang.en import English
from spacy.lang.hi import Hindi

# /home/tushar.abhishek/nlpa/assignment-2
MODEL_NAME = 'attention-seq2seq(dot)'
CACHE_DIR = "/home/tushar/Desktop/MS/sem 2/nlpa/assignment-2/saved_models/effective_attention"

# BLEU smoothing function
smoothie = SmoothingFunction()
spacy_en, spacy_hi = English(), Hindi()

log_file = os.path.join(CACHE_DIR, "%s.log" % MODEL_NAME)

# Logging to a file
logging.basicConfig(filename=os.path.abspath(log_file), filemode='w', level=logging.DEBUG,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Logging to standard output as well
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))


class EncoderRNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, layer_count, dropout_rate):
        super().__init__()
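        # --- Hypothetical continuation (not in the original snippet) ---
        # A plausible sketch of how the constructor and forward pass of this
        # attention-seq2seq encoder could continue; the specific layer choices
        # (embedding + GRU + dropout) are assumptions, not the author's code.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=layer_count,
            dropout=dropout_rate if layer_count > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, src, src_lengths):
        # src: [src_len, batch]; embed, pack the padded batch, run the GRU, unpack.
        embedded = self.dropout(self.embedding(src))
        packed = pack_padded_sequence(embedded, src_lengths, enforce_sorted=False)
        packed_outputs, hidden = self.rnn(packed)
        outputs, _ = pad_packed_sequence(packed_outputs)
        # outputs: [src_len, batch, hidden_dim]; hidden: [layer_count, batch, hidden_dim]
        return outputs, hidden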
from spacy.lang.hi import Hindi


def test_issue3625():
    """Test that default punctuation rules apply to Hindi unicode characters."""
    nlp = Hindi()
    doc = nlp("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    assert [token.text for token in doc] == expected