Example #1
import re
import string

def sent_list(docs, splitStr='__label__'):
    sent_analysis = []
    for i in range(1, len(docs)):  # index 0 is skipped (assumed header line)
        text = str(docs[i])
        splitText = text.split(splitStr)
        secHalf = splitText[1]
        sentiment = secHalf[0]
        # Drop the label digit, the space after it, and the trailing newline.
        text = secHalf[2:].rstrip().lower()
        # Replace URLs before stripping punctuation, since the pattern
        # below relies on the dots still being present.
        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
        # str.translate returns a new string, so the result must be
        # assigned back for the punctuation removal to take effect.
        table = str.maketrans('', '', string.punctuation)
        text = text.translate(table)
        sent_analysis.append([text, sentiment])
    return sent_analysis
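A quick usage sketch, assuming fastText-style input lines of the form '__label__<digit> <text>' with a header as the first element (both strings below are made up for illustration):

docs = ['header line', '__label__1 Great phone, see www.example.com for specs.\n']
print(sent_list(docs))
# e.g. [['great phone see url for specs', '1']]
# (the <url> placeholder loses its angle brackets to the punctuation pass)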
Example #2
import sys

import nltk

if sys.version_info < (3,):
    from string import maketrans  # Python 2
else:
    maketrans = str.maketrans  # Python 3

def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of POS-tagged tokens.
    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).
    # Returns
        A list of (token, part-of-speech tag) tuples.
    """
    if lower:
        text = text.lower()

    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_map = dict((ord(c), unicode(split)) for c in filters)
    else:
        translate_map = maketrans(filters, split * len(filters))

    text = text.translate(translate_map)
    # Argument-less split() also collapses the runs of whitespace left
    # behind by the filtered characters.
    seq = text.split()
    return nltk.pos_tag(seq)
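A usage sketch; nltk.pos_tag requires the tagger model to be downloaded once via nltk.download('averaged_perceptron_tagger'):

print(text_to_word_sequence("The quick, brown fox!"))
# e.g. [('the', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN')]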
Example #3
import re
import string

from nltk.stem import SnowballStemmer

def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()
    stemmer = SnowballStemmer('english')  # build once, outside the loop

    cleaned_text_list = []
    for text in input_text_list:
        # Remove punctuation the cleanup rules below do not handle;
        # str.translate needs a mapping table, not the raw
        # string.punctuation string.
        keep = ",.!/^+-=:'"
        text = text.translate(str.maketrans(
            '', '', ''.join(c for c in string.punctuation if c not in keep)))
        text = text.lower()  # Convert words to lower case

        # Clean the text: drop everything except A-Za-z0-9 ^ , ! . / ' + - =
        # (the hyphen is escaped so it is not read as a character range)
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t\.co/[A-Za-z]{10}", " ", text)

        text = text.split()

        # First stopword pass, before stemming.
        text = [word for word in text
                if word not in stoplist]

        stemmed_words = [stemmer.stem(word) for word in text]

        # Second pass: stemming can turn words into stopword forms.
        cleanwordlist = [
            word for word in stemmed_words if word not in stoplist
        ]

        text = " ".join(cleanwordlist)

        cleaned_text_list.append(text)
    return cleaned_text_list
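A minimal run; read_stopwords() is the snippet's own helper, assumed here to return an NLTK-style set of English stopwords:

print(_remove_pattern_2(["What's the weather? I'm asking for 10k people."]))
# e.g. ['weather ask 10000 peopl'] with an NLTK-style stopword list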
Example #4
def bigram_text_to_word_sequence(text, bigram, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=False, split=" "):
    '''filters: sequence of characters to filter out
       (old Keras exposed this default via base_filter())
    '''
    if lower:
        text = text.lower()
    text = text.translate(str.maketrans(filters, split * len(filters)))
    seq = text.split(split)
    sentences = [_f for _f in seq if _f]
    return bigram(sentences)
Example #5
def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    if lower:
        text = text.lower()
    # In Python 3 every str is Unicode, so a single codepoint-to-codepoint
    # mapping is enough; no Python 2 branch is needed.
    translate_table = {ord(c): ord(t) for c, t in zip(filters, split * len(filters))}
    text = text.translate(translate_table)
    seq = text.split(split)
    return [i for i in seq if i]
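A usage sketch for bigram_text_to_word_sequence from Example #4; bigram is any callable mapping a token list to a token list with frequent pairs merged (for instance a trained gensim Phraser wrapped in a lambda). The identity stand-in below is only for illustration:

bigram = lambda tokens: tokens  # stand-in for a real bigram model
print(bigram_text_to_word_sequence("New York is big.", bigram, lower=True))
# -> ['new', 'york', 'is', 'big']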
Example #6
from textblob import TextBlob
from textblob.exceptions import NotTranslated

def translate(comment):
    # Bytes in, text out: decode first if needed.
    if hasattr(comment, "decode"):
        comment = comment.decode("utf-8")

    text = TextBlob(comment)
    try:
        # translate() calls the Google Translate web API (network
        # required); it was deprecated and later removed from TextBlob,
        # so this needs an older release.
        text = text.translate(to="en")
    except NotTranslated:
        # Raised when the API returns the text unchanged
        # (e.g. it is already English).
        pass

    return str(text)
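A usage sketch; this needs network access and an older TextBlob release that still ships translate():

print(translate(b"Bonjour tout le monde"))
# e.g. 'Hello everyone'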
Example #7
import re
import string

def _remove_pattern_2(input_text_list):

    cleaned_text_list = []
    for text in input_text_list:
        # Remove punctuation the cleanup rules below do not handle;
        # str.translate needs a mapping table, not the raw
        # string.punctuation string.
        keep = ",.!/^+-=:'"
        text = text.translate(str.maketrans(
            '', '', ''.join(c for c in string.punctuation if c not in keep)))
        text = text.lower()  # Convert words to lower case

        # Remove stop words (optional step, left disabled)
        # text = text.split()
        # stops = set(stopwords.words("english"))
        # text = [w for w in text if not w in stops and len(w) >= 3]

        # text = " ".join(text)

        # Clean the text: drop everything except A-Za-z0-9 ^ , ! . / ' + - =
        # (the hyphen is escaped so it is not read as a character range)
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

        # Stemming (optional step, left disabled)
        # text = text.split()
        # stemmer = SnowballStemmer('english')
        # stemmed_words = [stemmer.stem(word) for word in text]
        # text = " ".join(stemmed_words)

        cleaned_text_list.append(text)
    return cleaned_text_list
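Unlike the Example #3 variant, this version leaves stopword removal and stemming disabled, so every cleaned token survives:

print(_remove_pattern_2(["I'm testing, e-mail please!"]))
# e.g. ['i am testing email please ! ']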
Example #8
import re
import string

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

def clean_text(text):
    ## Remove punctuation the cleanup rules below do not handle;
    ## str.translate needs a mapping table, not the raw punctuation string.
    text = str(text)
    keep = ",.!/^+-=:'"
    text = text.translate(str.maketrans(
        '', '', ''.join(c for c in string.punctuation if c not in keep)))

    ## Convert words to lower case and split them
    text = text.lower().split()

    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops and len(w) >= 3]

    text = " ".join(text)
    ## Clean the text: drop everything except A-Za-z0-9 ^ , ! . / ' + - =
    ## (the hyphen is escaped so it is not read as a character range)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text
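A quick check for clean_text; the stopword list requires a one-time nltk.download('stopwords'):

print(clean_text("What's the best e-mail client? I'm asking for 10k users."))
# e.g. 'what is best email client i am ask 10000 user'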
Example #9
import sys

if sys.version_info < (3,):
    from string import maketrans  # Python 2
else:
    maketrans = str.maketrans  # Python 3

def corenlp_tokenize_enpbt(text,
                           filters="!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n",
                           lower=True,
                           split=" "):

    if lower:
        text = text.lower()

    if sys.version_info < (3, ) and isinstance(text, unicode):
        translate_map = dict((ord(c), unicode(split)) for c in filters)
    else:
        translate_map = maketrans(filters, split * len(filters))
    text = text.translate(translate_map)

    # settings.CORENLP_CLIENT is a preconfigured CoreNLP client from the
    # project's settings module; sentencelessToken holds the document's
    # tokens without sentence splitting.
    ann = settings.CORENLP_CLIENT.annotate(text)
    return [x.word for x in ann.sentencelessToken]
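A hedged setup sketch for the client this snippet expects; only the name settings.CORENLP_CLIENT comes from the code above, the stanza wrapper and its arguments are illustrative:

# settings.py (illustrative)
from stanza.server import CoreNLPClient
CORENLP_CLIENT = CoreNLPClient(annotators=['tokenize'], be_quiet=True)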
Example #10
def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=" "):
    if lower:
        text = text.lower()

    # In Python 3 every str is Unicode, so a single codepoint-to-codepoint
    # mapping is enough; no Python 2 branch is needed.
    translate_table = {
        ord(c): ord(t)
        for c, t in zip(filters, split * len(filters))
    }
    text = text.translate(translate_table)
    seq = text.split(split)
    return [i for i in seq if i]
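A quick check:

print(text_to_word_sequence("Hello, world! Foo_bar?"))
# -> ['hello', 'world', 'foo', 'bar']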
Example #11
def remove_punctuation(text):
    """Custom function to remove punctuation."""
    # PUNCT_TO_REMOVE is defined elsewhere in the source module,
    # typically string.punctuation.
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
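A usage sketch, assuming PUNCT_TO_REMOVE is string.punctuation:

import string

PUNCT_TO_REMOVE = string.punctuation
print(remove_punctuation("Hello, world!"))
# -> 'Hello world'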