Example #1
def import_data(a_file, a_row, b_file, b_row):
    a_content = []
    a_content_1 = open(a_file, 'r')
    csv_reader_a = csv.reader(a_content_1)
    for row in csv_reader_a:
        row_new = remove_stopwords(row[a_row])
        row_new = strip_numeric(row_new)
        row_new = strip_non_alphanum(row_new)
        row_new = strip_short(row_new, minsize=3)
        a_content.append(row_new)
    a_length = len(a_content)
    a_label = np.ones(a_length)
    a_label = a_label.tolist()

    b_content = []
    b_content_1 = open(b_file, 'r')
    csv_reader_b = csv.reader(b_content_1)
    for row in csv_reader_b:
        row_new = remove_stopwords(row[b_row])
        row_new = strip_numeric(row_new)
        row_new = strip_non_alphanum(row_new)
        row_new = strip_short(row_new, minsize=3)
        b_content.append(row_new)
    b_length = len(b_content)
    b_label = np.zeros(b_length)
    b_label = b_label.tolist()

    return a_content, a_label, b_content, b_label
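The CSV loaders on this page assume csv, numpy and the gensim preprocessing helpers are imported at module level; a minimal sketch of that setup plus a hypothetical call (file names and column indices are made up):
import csv
import numpy as np
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_non_alphanum, strip_short

# Hypothetical call: column 1 of each CSV holds the text to clean.
# a_texts, a_labels, b_texts, b_labels = import_data('human.csv', 1, 'machine.csv', 1)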
Example #2
def import_data(file):
    human = []
    machine = []
    content = open(file, 'r')
    csv_reader = csv.reader(content)
    for row in csv_reader:
        row1 = unicode(row[2], errors='ignore')
        row_new1 = remove_stopwords(row1)
        row_new1 = strip_numeric(row_new1)
        #row_new = strip_non_alphanum(row_new)
        row_new1 = strip_short(row_new1, minsize=3)
        human.append(row_new1)
        row2 = unicode(row[3], errors='ignore')
        row_new2 = remove_stopwords(row2)
        row_new2 = strip_numeric(row_new2)
        #row_new = strip_non_alphanum(row_new)
        row_new2 = strip_short(row_new2, minsize=3)
        machine.append(row_new2)

    length = len(human)
    human_label = np.ones(length)
    human_label = human_label.tolist()
    machine_label = np.zeros(length)
    machine_label = machine_label.tolist()

    return human, human_label, machine, machine_label
Example #3
def ALLCAPS(text):
    '''Calculates the number of ALL CAPS words at the start of the message
     after removing http addresses, numbers and multiple whitespaces

    input: 
        text: a string
    returns: 
        the number of ALL CAPS words at the start of the message
    '''
    text = preprocess.strip_numeric(text) #get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('',text)
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('',text)
    text = preprocess.strip_multiple_whitespaces(text)
    words = text.split()
    ALLCAPScount = 0

    for w in words:
        if not w.isupper():
            break
        ALLCAPScount = ALLCAPScount + 1

    if ALLCAPScount:    
        if (words[ALLCAPScount-1] == 'A'):    
            ALLCAPScount = ALLCAPScount - 1

    return ALLCAPScount
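A quick sanity check of ALLCAPS, assuming gensim.parsing.preprocessing is imported as preprocess and re is available; the sample message is an assumption:
import re
from gensim.parsing import preprocessing as preprocess

# print(ALLCAPS("URGENT REPLY now please"))  # counts "URGENT" and "REPLY" -> 2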
Example #4
def getLemmatizedText(name, content, language):
  language = language[:2]
  language = language.lower()
  outText = ""
  if (language):
    if (language=="is"):
      outText = getLemmatizedTextIS(name, content)
      print("IS")
    else:
      outText = lemmatizerMultilanguage.getLemmatizedText(language, name+" "+content)
      print(language.upper())
  else:
    text = name+" "+content
    outText = text.lower().replace('.','.')
    print("ERROR: No language for Lemmatizing text")
  cleaned = re.sub(' +', ' ',outText)
  cleaned = cleaned.replace('\n', '')
  cleaned = cleaned.replace('\r', '')

  cleaned = remove_stopwords(cleaned)
  cleaned = strip_tags(cleaned)
  cleaned = strip_punctuation(cleaned)
  cleaned = strip_numeric(cleaned)
  cleaned = strip_short(cleaned, 1)
  cleaned = strip_multiple_whitespaces(cleaned)
  cleaned = cleaned.lower()

  print("Lemmatized CLEAN: "+cleaned)
  return cleaned
Example #5
def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques: 
    removes stopwords, strips short words (1-2 characters), strips numbers, 
    strips http addresses, strips Unicode from emoji etc., lowercases everything, 
    strips extra spaces, punctuation, non-alphanumeric symbols. Also perform stemming

    input: 
        text: a string
    returns: 
        the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text) # remove stop words
    text = preprocess.strip_short(text) #get rid of short words
    text = preprocess.strip_numeric(text) #get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('',text)
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('',text)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)

    return text
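preprocessing() relies on module-level names (re, a preprocess alias for gensim's helpers, and a stemmer) that are not shown; a minimal sketch of that setup, with the Porter stemmer as an assumed choice:
import re
from gensim.parsing import preprocessing as preprocess
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()  # assumed; any object with a .stem(word) method works here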
Example #6
def clean_text(text):
    """ Cleans the text in the only argument in various steps 
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''

    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    # text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possible be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    #text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    #text = preprocessing.stem_text(text)
    return text
Example #7
    def merge_elements(self, json_zone: dict):
        """
        Documentation
        Merge the zone elements in a dictionnary in order to avoid overrides
        Parameter:
            json_zone: json containing the zones info
        Out:
            merge_dictio: dictionnary of zones information merged
        """
        merge_dictio = {}
        for k in json_zone.keys():
            merge_dictio[k] = {}
            for el in json_zone[k].keys():
                merge_dictio[k][strip_non_alphanum(
                    strip_numeric(el.split('.')[0])).replace(' ', '')] = []

        keys = merge_dictio.keys()
        for k in json_zone.keys():
            for el in json_zone[k].keys():
                for merge_key in merge_dictio[k].keys():
                    if merge_key in el:
                        merge_dictio[k][merge_key] += json_zone[k][el]
                    merge_dictio[k][merge_key] = list(
                        dict.fromkeys(merge_dictio[k][merge_key]))
        return merge_dictio
Example #8
def noPuncNoNumb(corpora):
    List_No_punct_numb = [[[strip_numeric(strip_punctuation(stringa)) for stringa in group]
                           for group in corpus] for corpus in corpora]

    # print("\nList_No_punct_numb:")
    # print(List_No_punct_numb)
    return List_No_punct_numb
Example #9
def readFromDir(osList):
    """
    This reads the scraped raw data
    """

    textList = []

    for i in range(len(osList)):
        filesList = []
        textArray = []
        for (dirpath, dirnames, filenames) in os.walk(osList[i]):
            filesList.extend(filenames)
            os.chdir(osList[i])
            for fname in filesList:
                with open(fname, 'r', encoding='utf-8') as file:
                    text_str = file.read()
                    textArray.append(text_str.lower())

            text_arr = ','.join(textArray)
            text_arr = strip_punctuation(text_arr)
            text_arr = strip_numeric(text_arr)
            text_arr = strip_non_alphanum(text_arr)
            textList.append(text_arr)

        os.chdir('..')

    return textList
Example #10
def sentence_tokenize_and_word_tokenize_and_remove_stop_words(
        text, tokenizer, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            sentences = tokenizer.tokenize(text.lower())
        else:
            sentences = tokenizer.tokenize(str(text).lower())
    except UnicodeDecodeError as e:
        return ''
    if len(sentences) == 0:
        return ''
    text_total = ''
    for sentence in sentences:
        words = sentence.split()
        if len(words) == 0:
            continue
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        try:
            text = preprocessing.strip_punctuation(text)
            text = preprocessing.strip_non_alphanum(text)
            text = preprocessing.strip_numeric(text)
            text = preprocessing.strip_tags(text)
            text = preprocessing.strip_multiple_whitespaces(text)
            words = text.split()
            if len(words) == 0:
                continue
            text = ' '.join(filter(lambda x: x not in stop_word2, words))
            text_total = text_total + text.encode('utf-8') + '#'
        except UnicodeDecodeError as e:
            pass
    return text_total
Example #11
def file_read_csv(path, txt_column=[]):
    print("Pickle File I/O Example - text Read")
    myfile = open(path, "r")
    text = myfile.read()
    text = strip_non_alphanum(text)
    text = strip_numeric(text)
    return [text]
Example #12
    def clean_text (self, text_tag, processes = ["urls", "punctuation", "numeric", "lower"]):
        text = self.texts [text_tag]

        #print (text)

        if "urls" in processes:
            text = [re.sub(r"(?:\@|https?\://)\S+", "", str(x)) for x in text]
            text = [re.sub(r' +', ' ', str(x)) for x in text]
        if "stopwords" in processes:
            text = [remove_stopwords (x) for x in text]
        if "punctuation" in processes:
            text = [strip_punctuation(x) for x in text]
        if "numeric" in processes:
            text = [strip_numeric(x) for x in text]

        text = [x.replace('"', "") for x in text]
        text = [x.replace('©', "") for x in text]
        text = [x.replace('\n', " ") for x in text]
        text = [x.replace('\r', ".") for x in text]
        text = [x.replace('QT', " ") for x in text]
        text = [x.replace('RT', " ") for x in text]
        text = [x.replace('#', " ") for x in text]
        text = [strip_multiple_whitespaces(x) for x in text]
        text = [x.strip() for x in text]

        if "lower" in processes:
            text = [x.lower() for x in text]
        # clean_text = [nltk.sent_tokenize (x) for x in  clean_text]

        self.texts[text_tag] = text
Example #13
def preprocess_text(corpus=[]):
    print("Preprocessing Corpus from list data structure")
    for i, val in enumerate(corpus):  #iterate through list
        corpus[i] = corpus[i].strip('\n')
        corpus[i] = strip_punctuation(corpus[i])
        corpus[i] = strip_non_alphanum(corpus[i])
        corpus[i] = strip_numeric(corpus[i])
    return corpus
Example #14
def raw_text_preprocess(raw):
    raw = re.sub(r"http\S+", "", raw)
    raw = strip_non_alphanum(raw).lower().strip()
    raw = split_alphanum(raw)
    raw = strip_short(raw, minsize=2)
    raw = strip_numeric(raw)
    raw = ViTokenizer.tokenize(raw)
    return raw
Example #15
def raw_text_preprocess(d):
    d = re.sub(r"http\S+", "", d)
    d = strip_non_alphanum(d).lower().strip()
    d = split_alphanum(d)
    d = strip_short(d, minsize=2)
    d = strip_numeric(d)
    d = ViTokenizer.tokenize(d)
    return d
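Both variants depend on ViTokenizer, presumably the Vietnamese tokenizer from the pyvi package; a hedged setup and a made-up call:
import re
from pyvi import ViTokenizer
from gensim.parsing.preprocessing import split_alphanum, strip_non_alphanum, strip_numeric, strip_short

# raw_text_preprocess("Sample text 123, see http://example.com")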
Example #16
def clean(sx):
    sx = strip_tags(sx)
    sx = strip_numeric(sx)
    sx = re.sub(r'\n', ' ', sx)
    sx = re.sub(r'\[', '', sx)
    sx = re.sub(r'\]', '', sx)
    sx = strip_multiple_whitespaces(sx)
    return sx
Example #17
def _normalize(s):
    s = s.lower()

    for k, v in contractions.items():
        s = s.replace(k, v)

    return strip_multiple_whitespaces(
        strip_non_alphanum(strip_numeric(strip_punctuation(
            strip_tags(s))))).split()
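_normalize expects a module-level contractions mapping defined elsewhere; a minimal stand-in (the entries are assumptions) plus the imports it needs:
from gensim.parsing.preprocessing import (
    strip_multiple_whitespaces, strip_non_alphanum, strip_numeric,
    strip_punctuation, strip_tags)

contractions = {"can't": "cannot", "won't": "will not"}  # assumed mapping

# _normalize("I can't stop <b>now</b>!")  # -> ['i', 'cannot', 'stop', 'now']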
Example #18
def preprocessing(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc, 3)
        #doc = stem_text(doc)
        doc = strip_punctuation(doc)
        doc = strip_tags(doc)
        yield gensim.utils.tokenize(doc, lower=True)
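The function above is a generator that yields one token iterator per document; a hypothetical way to materialise it for a small corpus:
import gensim
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_punctuation, strip_short, strip_tags

# corpus = ["Gensim makes text preprocessing 100 times easier!"]
# tokenized = [list(tokens) for tokens in preprocessing(corpus)]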
Example #19
def topWords(billText, numberWords):

    text = remove_stopwords(billText)
    text = strip_numeric(text)
    text = strip_short(text, minsize=2)
    words = re.findall(r'\w+', text)
    topW = collections.Counter(words).most_common(numberWords)

    return topW
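topWords assumes collections, re and the gensim helpers are imported; a hypothetical call on a made-up bill text:
import collections
import re
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_short

# topWords("budget committee approved the budget amendment of 2021", 2)
# expected: [('budget', 2), ('committee', 1)]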
Example #20
def gen_wf(text):
    words = nltk.tokenize.word_tokenize(strip_numeric(remove_stopwords(text)))

    resultwords = [
        word.lower() for word in words
        if ((word.lower() not in stopwords) and (len(word) > 2))
    ]

    fdist = nltk.FreqDist(resultwords)
    return dict(fdist)
Example #21
def preprocess_mail(mail):
    mail = re.sub(r"https\S+", "", mail)  # remove links
    mail = strip_non_alphanum(mail).lower().strip()  # strip non-alphanumeric characters and lowercase everything
    mail = split_alphanum(mail)  # add a space between letters and digits
    mail = strip_short(mail, minsize=2)  # keep words of at least 2 characters, drop single-letter words
    mail = strip_numeric(mail)  # remove digits
    mail = ViTokenizer.tokenize(mail)
    return mail
Example #22
def preprocess_text(corpus,field_name = 'Comment'):
    print("Preprocessing Corpus from pandas data frame")
    for index, row in corpus.iterrows():  #iterate through rows in dataframe
        line = row['Comment'].strip('\n')
        line = strip_punctuation(line)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        line = strip_multiple_whitespaces(line)
        line = strip_short(line)
        #add cleaned text line to new dataframe
        corpus.at[index,field_name] = line  # set value at row/column in the corpus dataframe
    return corpus
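A hypothetical call, assuming pandas and the gensim helpers are available and the frame has a 'Comment' column:
import pandas as pd
from gensim.parsing.preprocessing import (
    strip_punctuation, strip_non_alphanum, strip_numeric,
    strip_multiple_whitespaces, strip_short)

# df = pd.DataFrame({'Comment': ['Great product!!! 10/10 would buy again.', 'meh...']})
# cleaned = preprocess_text(df)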
Example #23
    def preprocess(self, data):

        data = [s.lower() for s in data]

        data = [parser.remove_stopwords(s) for s in data]
        data = [parser.strip_numeric(s) for s in data]
        data = [tokenizer.tokenize(s) for s in data]

        data = [[token for token in doc if len(token) > 1] for doc in data]
        data = [[lemmatizer.lemmatize(word) for word in doc] for doc in data]

        return data
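The preprocess method refers to module-level parser, tokenizer and lemmatizer objects; one plausible setup (the specific choices are assumptions) would be:
from gensim.parsing import preprocessing as parser
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

tokenizer = RegexpTokenizer(r'\w+')   # assumed word tokenizer
lemmatizer = WordNetLemmatizer()      # assumed lemmatizer; requires the NLTK WordNet data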
Example #24
def preprocess_text(text):
    text = parse_html_v2(text)
    text = text.lower()
    text = remove_links_content(text)
    text = remove_emails(text)
    text = remove_special_tags(text)  # remove content between {}
    text = remove_punctuation(text)  # remove all puntuations
    text = split_alphanum(text)  # add space between word and numeric
    text = strip_numeric(text)  # remove digits
    text = strip_non_alphanum(text)  # remove non-alphabetic characters
    text = strip_short(text, minsize=2)  # remove word with length < minsize
    text = remove_multiple_space(text).strip()  # remove space and strip
    text = ViTokenizer.tokenize(text)
    return text
Example #25
def import_data(file, row_content, x):
    content = []
    label = []
    content_1 = open(file, 'r')
    csv_reader = csv.reader(content_1)
    for row in csv_reader:
        row_new = remove_stopwords(row[row_content])
        row_new = strip_numeric(row_new)
        #row_new = strip_non_alphanum(row_new)
        row_new = strip_short(row_new, minsize=3)
        content.append(row_new)
    length = len(content)
    for i in range(0, length):
        label.append(x)

    return content, label
Example #26
def remove_non_plain(document):
    """
    Replaces urls, @usernames, #tags, emojis and numbers
    with a ' ' (space). Also removes accents and punctuation
    to finally remove redundant whitespace and lowercase all
    characters
    :param document: string
    :return: processed unicode string
    """
    document = to_unicode(document)
    document = non_plain_re.sub(' ', document)
    document = proc.strip_non_alphanum(document)
    document = proc.strip_numeric(document)
    document = proc.strip_multiple_whitespaces(document)
    document = deaccent(document)
    return document.lower()
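remove_non_plain depends on a precompiled non_plain_re plus gensim's to_unicode and deaccent; a sketch of that setup, where the exact pattern is an assumption:
import re
from gensim.utils import deaccent, to_unicode
from gensim.parsing import preprocessing as proc

# Assumed pattern for urls, @usernames and #tags; the project's real regex may differ.
non_plain_re = re.compile(r'(https?://\S+)|(@\w+)|(#\w+)')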
Example #27
def clean_text(x: str) -> str:
    """
    :param x: raw string
    :return x: cleaned string
    """

    x = x.lower()
    x = re.sub('ssense|exclusive', '', x)

    x = strip_non_alphanum(x)
    x = strip_numeric(x)
    x = strip_short(x, minsize=2)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)

    return x
Example #28
def read_data(data_source_path):
    corpus = []
    for filename in os.listdir(data_source_path):
        if filename.endswith(".zip"):
            filename = os.path.join(data_source_path, filename)
            """Extract the first file enclosed in a zip file as a list of words."""
            with zipfile.ZipFile(filename) as f:
                data = preprocessing.remove_stopwords(
                    f.read(f.namelist()[0]).lower())
                data = preprocessing.strip_multiple_whitespaces(data)
                data = preprocessing.strip_numeric(data)
                #data = preprocessing.split_alphanum(data)
                #data = f.read(f.namelist()[0])
                data = tf.compat.as_str(data).split()
                #data = preprocessing(data)
                corpus.append(data)
    return corpus
Example #29
    def prep_text_czech(self, text):
        res = preprocessing.strip_punctuation(text.lower())
        if self.settings['strip_nums']:
            res = preprocessing.strip_numeric(res)

        if self.settings['use_lemmatizer']:
            res = " ".join(
                [czech_lemmatizer.lemmatize(word) for word in res.split()])

        res = " ".join(
            [word for word in res.split() if word not in cz_stopwords])

        if self.settings['strip_short']:
            res = preprocessing.strip_short(res, minsize=3)
        if self.settings['use_stemmer']:
            res = " ".join(
                [czech_stemmer.cz_stem(word) for word in res.split()])
        return res
Example #30
def wordcount(text):
    '''Calculate post length after removing http addresses, 
       numbers and multiple whitespaces

    input: 
        text: a string
    returns: 
        the adjusted wordcount.
    '''
    text = preprocess.strip_numeric(text) #get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('',text)
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('',text)
    text = preprocess.strip_multiple_whitespaces(text)
    words = text.split()
    count = len(words)
    return count
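A quick check of wordcount, with the same preprocess alias assumed as in the other examples; the sample post is made up:
import re
from gensim.parsing import preprocessing as preprocess

# wordcount("Check this out 123 http://example.com")  # -> 3 ("Check", "this", "out")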
Example #31
    def testStripNumeric(self):
        self.assertEqual(strip_numeric("salut les amis du 59"), "salut les amis du ")
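This assertion comes from a unittest test case; a minimal, self-contained harness (the class name is an assumption) to run it:
import unittest
from gensim.parsing.preprocessing import strip_numeric

class TestPreprocessing(unittest.TestCase):
    def testStripNumeric(self):
        self.assertEqual(strip_numeric("salut les amis du 59"), "salut les amis du ")

if __name__ == '__main__':
    unittest.main()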