Example #1
    def analysis_word(self, word):
        # Classify once and reuse the result instead of re-running langid for every branch.
        lang = langid.classify(word)[0]
        if lang == 'ja' or lang == 'zh':
            # Treat Chinese like Japanese (kanji-only words are often labelled 'zh').
            return 'jp'
        # Fall back to English for everything else.
        return 'en'
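For context, a minimal sketch of what langid.classify actually returns: a (language_code, score) tuple, where the second element is a raw model score rather than a percentage. The sample strings below are illustrative only.

import langid

# Inspect the (lang, score) tuple that langid.classify returns for a few strings.
for text in ('hello world', '規定', 'こんにちは'):
    lang, score = langid.classify(text)
    print(text, lang, score)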
def parse_data(data):
    """
    Parse all unique sentences in data.
    
    :param data: pandas.DataFrame with text data
    :returns: parsed_data: pandas.DataFrame with one parsed sentence per row
    """
    parser_en = spacy.load('en_core_web_md', disable=['ner', 'textcat'])
    parser_es = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])
    # swap in custom NLTK tokenizers (TweetTokenizer for English, ToktokTokenizer for Spanish)
    parser_en.tokenizer = NLTKTokenizerSpacy(parser_en.vocab, TweetTokenizer())
    parser_es.tokenizer = NLTKTokenizerSpacy(parser_es.vocab, ToktokTokenizer())
    data.loc[:, 'lang'] = data.loc[:, 'txt'].apply(lambda x: langid.classify(x)[0])
    parsed_data = []
    for i, data_i in data.iterrows():
        txt = data_i.loc['txt']
        txt = clean_data_for_spacy(txt)
        sents = sent_tokenize(txt)
        parsed_data_i = []
        for sent in sents:
            if(data_i.loc['lang'] == 'es'):
                parse_i = parser_es(sent)
            else:
                parse_i = parser_en(sent)
            # extract tree
            tree_i = build_parse(parse_i, parse_type='spacy')
            parsed_data_i.append(tree_i)
        parsed_data_i = pd.DataFrame(pd.Series(parsed_data_i), columns=['parse'])
#         logging.debug('processing id %s/%s'%(data_i.loc['id'], int(data_i.loc['id'])))
        parsed_data_i = parsed_data_i.assign(**{'id' : int(data_i.loc['id'])})
        parsed_data.append(parsed_data_i)
    parsed_data = pd.concat(parsed_data, axis=0)
#     parsed_data.loc[:, 'id'] = parsed_data.loc[:, 'id'].astype(np.int64)
    return parsed_data
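Since parse_data only needs to distinguish Spanish from English, it may help to constrain langid to those two labels before classifying; langid.py exposes set_languages for this. A minimal sketch, where restricting to ['en', 'es'] is an assumption about this data rather than part of the original code:

import langid

# Constrain the classifier to the labels this pipeline actually branches on;
# anything else will be forced into one of these two.
langid.set_languages(['en', 'es'])
print(langid.classify('¿Dónde está la biblioteca?'))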
Example #3
    def translate_with_pronounce(self, word):
        self.init(word)
        lang = langid.classify(word)[0]
        t = None
        if lang == 'en':
            self.lang = 'en'
        elif lang == 'ja' or lang == 'zh':
            self.lang = 'jp'
            h = HujiangTranslation()
            h.init(word)
            t = h.get_data()
        else:
            self.lang = 'en'
        self.parse_strategy.lang = self.lang
        self.data = {
            'from': self.lang,
            'to': 'zh',
            # 'source':'txt',
            'query': self.url.request_key_param
        }
        if self.lang != 'zh':
            if t:
                return (self.get_data(), t)
            return (self.get_data(), '')
        return ('zh','')
Example #4
def detect_language(text, threshold=0.9):
    classif = langid.classify(text)
    if not classif:
        return u'und'

    if classif[1] >= threshold:
        return classif[0]

    return 'und'
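A threshold such as 0.9 presupposes a confidence in [0, 1], while the score that langid.classify returns by default is not a normalized probability. langid.py's LanguageIdentifier can be configured to normalize; a minimal sketch of that variant (detect_language_norm is an illustrative name, not part of the example above):

from langid.langid import LanguageIdentifier, model

# Build an identifier whose scores are normalized probabilities in [0, 1],
# which is what a 0.9 threshold implicitly assumes.
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def detect_language_norm(text, threshold=0.9):
    lang, prob = identifier.classify(text)
    return lang if prob >= threshold else 'und'

print(detect_language_norm('This is clearly an English sentence.'))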
Example #5
def detect_language(text):
    classif = langid.classify(text)
    if not classif:
        return 'UNK'

    if classif[1] > 0.9:
        return classif[0]

    return 'UNK'
def is_english(str):
    '''
    Return the comment only if it is in English.
    :param str:
    :return:
    '''
    if str is None:
        return None
    if langid.classify(str)[0] == 'en':  # drop non-English comments
        return str
    else:
        return None
def gen_output(data, json_data_dir):

    term, is_reply, tweets_needed = data

    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir, "*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f,
                                       store_json=True,
                                       do_arabic_stemming=False,
                                       lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets if t.retweeted is None and\
                           len(t.urls) == 0 and 'http:' not in t.text and\
                           len(t.tokens) > 5 and\
                           t.created_at >= MIN_TWEET_DATE and\
                           (term == '' or term in t.tokens) and\
                           langid.classify(t.text)[0] == 'en' and\
                           sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print('size 0', term, tweets_needed, is_reply)
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print(user.screen_name, term, tweets_needed, is_reply, "::::  ", tweet.text)
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print('done with: ', name, is_reply)
                return

        else:
            print('failed user')
def write_status_txt(tweet_file, out_file_status, out_file_txt, langs=['en']):
    """
    Write status info and text to file.
    
    :param tweet_file: tweet input file
    :param out_file_status: status output file
    :param out_file_txt: text output file
    """
    status_idx = [x for x in X_KEYS if x != 'text']
    with gzip.open(out_file_status, 'wb') as out_file_status_output, open(out_file_txt, 'wb') as out_file_txt_output:
        with gzip.open(tweet_file, 'r') as tweet_file_input:
#             try:
            for i, x in enumerate(tweet_file_input):
#             for archive_file in archive_dir.filelist:
#                 for i, x in enumerate(archive_dir.open(archive_file)):
                # check for JSON
                if(i == 0):
                    x_str = x.decode('utf-8').strip()
                    try:
                        x_test = load_json(x_str)
#                         x_test = literal_eval(x_str)
#                         x_test = json.loads(x_str)
#                             json.loads(x.decode('utf-8').strip())
                        input_file_json = type(x_test) is dict
                    except Exception as e:
                        logging.debug('json error %s with tweet %s'%(e, str(x)))
                        input_file_json = False
                # check for dummy .tsv first line
                if(not input_file_json and x.decode('utf-8').split('\t')[0]=='username'):
                    continue
                else:
                    x_data = process_line(x, input_file_json)
                    x_data_status = x_data.loc[status_idx].values
                    x_data_txt = clean_txt(x_data.loc['text'])
                    if(x_data_txt == ''):
                        logging.debug('empty text at status %s'%(x))
                    ## TODO: include original status text in status data file
                    ## only tag valid statuses
                    if(x_data_txt != ''):
                        ## filter for language
                        x_lang = langid.classify(x_data_txt)
                        if(x_lang[0] in langs):
                            x_data_status = np.append(x_data_status, [x_lang[0]])
                            x_data_status_str = [str(y) for y in x_data_status]
                            out_file_status_output.write(('%s\n'%('\t'.join(x_data_status_str))).encode('utf-8'))
                            out_file_txt_output.write(('%s\n'%(x_data_txt)).encode('utf-8'))
                if(i % 1000 == 0):
                    logging.debug('processed %d tweets'%(i))
def gen_output(data, json_data_dir):

    term,is_reply,tweets_needed = data

    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir,"*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f,store_json=True,do_arabic_stemming=False,lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets if t.retweeted is None and\
                           len(t.urls) == 0 and 'http:' not in t.text and\
                           len(t.tokens) > 5 and\
                           t.created_at >= MIN_TWEET_DATE and\
                           (term == '' or term in t.tokens) and\
                           langid.classify(t.text)[0] == 'en' and\
                           sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print('size 0', term, tweets_needed, is_reply)
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print(user.screen_name, term, tweets_needed, is_reply, "::::  ", tweet.text)
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print('done with: ', name, is_reply)
                return

        else:
            print('failed user')
def prepare_data():
    reviews: List[Reviews] = load_reviews('rozetka.csv')
    print(f'Total records: {len(reviews)}')
    filtered: List[Reviews] = []
    for r in reviews:
        if len(r.review) < 32:
            continue  # too short
        lang = langid.classify(r.review)
        if lang[0] != 'uk':
            continue  # not Ukrainian
        filtered.append(r)
    print(f'Filtered records: {len(filtered)}')
    shuffle(filtered)
    threshold = int(len(filtered) * 0.7)
    learn = filtered[:threshold]
    print(f"Learn records set: {len(learn)}")
    test = filtered[threshold:]
    print(f"Test records set: {len(test)}")
    save_reviews('rozetka_learn.csv', learn)
    save_reviews('rozetka_test.csv', test)
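The minimum-length filter above matters because language identification on very short strings is unreliable, which is presumably why reviews under 32 characters are skipped. A quick illustrative check (the sample strings are made up, not taken from rozetka.csv):

import langid

# Compare a very short string with a full sentence; short inputs tend to
# produce low-evidence, unstable language labels.
for text in ('ок', 'Дуже якісний товар, рекомендую продавця'):
    print(repr(text), langid.classify(text))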
def parse_data_tweebo(data, tmp_dir='../../data/mined_tweets/'):
    """
    Parse all unique sentences in data,
    using TweeboParser for English and 
    spacy for Spanish. Assumes that TweeboParser
    has already been installed.
    TODO: extract from ugly CoNLL output
    
    :param data: pandas.DataFrame with text data
    :returns: parsed_data: pandas.DataFrame with one parse per row
    """
    ## TweeboParser reads its input from disk, so the English text must first be written to a temp file
    parser_es = spacy.load('es_core_news_sm', disable=['tagger', 'ner', 'textcat'])
    data.loc[:, 'lang'] = data.loc[:, 'txt'].apply(lambda x: langid.classify(x)[0])
    
    data_es = data[data.loc[:, 'lang'] == 'es']
    data_en = data[data.loc[:, 'lang'] != 'es']
    
    ## EN parses
    # write to temp file
    en_txt_file = os.path.join(tmp_dir, 'en_parse_tmp.txt')
    with open(en_txt_file, 'w') as en_txt_out:
        # clean text
        txt_clean = data_en.loc[:, 'txt'].apply(lambda x: clean_parse_txt(x)).values
        en_txt_out.write('\n'.join(txt_clean))
    # parse
    en_txt_parse_file = '%s.predict'%(en_txt_file)
    parse_command = 'cd TweeboParser && ./run.sh %s'%(en_txt_file)
    process = subprocess.Popen(parse_command.split(), stdout=subprocess.PIPE)
    output, err = process.communicate()
    # read parses
    parsed_data_en = read_tweeboparse_output(en_txt_parse_file)
    # remove files
    os.remove(en_txt_file)
    os.remove(en_txt_parse_file)
    
    ## ES parses
    parsed_data_es = data_es.loc[:, 'txt'].apply(lambda x: [build_parse(parser_es(y), parse_type='spacy') for y in sent_tokenize(x)])
Example #13
import csv

from langid import langid

if __name__ == '__main__':
    with open("Electronics.txt", "r", encoding="utf-8") as readFile:
        with open("electronics_filtered.csv", "w", encoding="utf-8", newline="") as writeFile:
            csvWriter = csv.writer(writeFile)
            currRow = ["", ""]
            for line in readFile.readlines():
                if line[-1] == "\n":
                    line = line[:-1]
                if line.startswith("review/score"):
                    currRow[0] = line[13:].strip()
                if line.startswith("review/text"):
                    currRow[1] = line[12:].strip()
                    csvWriter.writerow(currRow)
                    currRow = ["", ""]

    with open("electronics_filtered.csv", "r", encoding="utf-8") as filteredReadFile:
        with open("electronics_filtered_and_smaller.csv", "w", encoding="utf-8", newline="") as filteredWriteFile:
            csvReader = csv.reader(filteredReadFile)
            csvWriter = csv.writer(filteredWriteFile)
            for row in csvReader:
                # csv.reader yields strings, so compare the score numerically
                if row[0] and float(row[0]) == 3.0:
                    continue
                language, score = langid.classify(row[1])
                newRow = ["", "", ""]
                newRow[0] = language
                newRow[1] = row[0]
                newRow[2] = row[1]
                csvWriter.writerow(newRow)
curr_dataset = datasets_to_collect[0]

print(datasets_to_collect)

for f in files:
    user = TwitterUser(filename_for_tweets=f)

    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and\
        user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        tweet_set = [t for t in user.tweets if t.retweeted is None and\
                                                len(t.urls) == 0 and\
                                                len(t.tokens) > 5 and\
                                                t.created_at <= MIN_TWEET_DATE and\
                                                curr_dataset[0] in t.tokens and\
                                                langid.classify(t.text)[0] == 'en' and\
                                                sentiment(t.text)['compound'] != 0]
        if len(tweet_set) == 0:
            continue

        tweet = random.sample(tweet_set, 1)[0]
        print(user.screen_name, curr_dataset[0:2], "::::  ", tweet.text)
        curr_dataset[2].append(tweet)
        curr_dataset[1] -= 1
        if curr_dataset[1] == 0:
            pickle.dump(curr_dataset[2], open(curr_dataset[0] + ".p", 'wb'))
            if len(datasets_to_collect) == 1:
                print('DONE!!!')
                break
            datasets_to_collect = datasets_to_collect[1:]
            curr_dataset = datasets_to_collect[0]
Example #15
from langid import langid

s1 = '中文'
s2 = 'contenders'
s3 = 'こにちは'
s4 = '規定'
s5 = '上げる'
s6 = '寝る'
l = []
l.append(langid.classify(s1))
l.append(langid.classify(s2))
l.append(langid.classify(s3))
l.append(langid.classify(s4))
l.append(langid.classify(s5))
l.append(langid.classify(s6))
print('https://sp1.baidu.com/5b11fzupBgM18t7jm9iCKT-xh_/sensearch/selecttext?cb=jQuery110205234840516971939_1540970393811&q=content')
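To see why kanji-only strings such as '規定' can come back as either 'ja' or 'zh', it helps to look at the whole ranking instead of just the top label; langid.py also provides a rank function alongside classify. A small sketch:

from langid import langid

# Show the top candidate languages for a kanji-only string; 'ja' and 'zh'
# typically both appear near the top of the ranking.
for lang, score in langid.rank('規定')[:5]:
    print(lang, score)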
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--top_communities',
        default='../data/community_stats/2015_2016_top_500_communities.txt')
    parser.add_argument('--years', default='2015')
    parser.add_argument('--sample_size', default=100)
    args = parser.parse_args()
    top_communities = [
        l.lower().strip() for l in open(args.top_communities, 'r')
    ]
    sample_size = args.sample_size
    years = args.years.split(',')
    lang_ctr = {c: defaultdict(float) for c in top_communities}
    es = Elasticsearch(timeout=TIMEOUT)
    # comments for each community are split across two yearly indices
    for i, c in enumerate(top_communities):
        print('querying community #%d = %s' % (i, c))
        # collect sample
        community_post_sample = []
        for y in years:
            # index = 'reddit_comments-%s'%(y)
            index = 'reddit_comments-%s-2' % (y)
            # query = {"query": {"match": {"subreddit": c}}}
            query = {
                "query": {
                    "constant_score": {
                        "filter": {
                            "term": {
                                "subreddit": c
                            }
                        }
                    }
                }
            }
            results = helpers.scan(es, query=query, index=index)
            # reservoir sample
            ctr = 0
            for r in results:
                text = r['_source']['body']
                if (text != '[deleted]'):
                    # print(text)
                    # if smaller than sample, add to sample
                    if (len(community_post_sample) < sample_size):
                        community_post_sample.append(text)
                    # otherwise probabilistically replace
                    elif (random() <= REPLACE):
                        replace_index = randint(0, sample_size - 1)
                        community_post_sample[replace_index] = text
                    ctr += 1
                    if (ctr % 100000 == 0):
                        print('processed %d valid comments' % (ctr))
        # convert sample to languages
        for t in community_post_sample:
            language, confidence = classify(t)
            lang_ctr[c][language] += 1
        # normalize!
        for k in lang_ctr[c].keys():
            lang_ctr[c][k] /= sample_size
    # now write to file
    lang_ctr = pd.DataFrame(lang_ctr)
    out_dir = os.path.dirname(args.top_communities)
    out_file = os.path.join(out_dir, 'community_lang_sample.tsv')
    lang_ctr.to_csv(out_file, sep='\t')
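The sampling loop above replaces items with a fixed probability REPLACE, which does not give every comment an equal chance of being kept. For reference, a uniform reservoir sample (Algorithm R) replaces the i-th item with probability sample_size / i; a minimal, self-contained sketch independent of the Elasticsearch code above:

import random

def reservoir_sample(stream, k):
    """Uniformly sample k items from an iterable of unknown length (Algorithm R)."""
    sample = []
    for i, item in enumerate(stream, start=1):
        if len(sample) < k:
            sample.append(item)
        else:
            # Keep the new item with probability k / i by overwriting a random slot.
            j = random.randint(0, i - 1)
            if j < k:
                sample[j] = item
    return sample

print(reservoir_sample(range(1000), 5))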
curr_dataset = datasets_to_collect[0]

print(datasets_to_collect)

for f in files:
    user = TwitterUser(filename_for_tweets=f)

    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and\
        user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        tweet_set = [t for t in user.tweets if t.retweeted is None and\
                                                len(t.urls) == 0 and\
                                                len(t.tokens) > 5 and\
                                                t.created_at <= MIN_TWEET_DATE and\
                                                curr_dataset[0] in t.tokens and\
                                                langid.classify(t.text)[0] == 'en' and\
                                                sentiment(t.text)['compound'] != 0]
        if len(tweet_set) == 0:
            continue

        tweet = random.sample(tweet_set, 1)[0]
        print(user.screen_name, curr_dataset[0:2], "::::  ", tweet.text)
        curr_dataset[2].append(tweet)
        curr_dataset[1] -= 1
        if curr_dataset[1] == 0:
            pickle.dump(curr_dataset[2], open(curr_dataset[0] + ".p", 'wb'))
            if len(datasets_to_collect) == 1:
                print('DONE!!!')
                break
            datasets_to_collect = datasets_to_collect[1:]
            curr_dataset = datasets_to_collect[0]
def extract_langid(sentence):
    ensure_package_path()
    from langid import langid
    message = sentence['Sentence']
    sentence['Language'] = langid.classify(message)[0]
    return sentence
Example #19
def alias_by_str(str):
    return langid.classify(str)[0].upper()
Example #20
    def closed_pull_requests_summary(self):
        pulls_file = self.folder + '/pull_requests.json'
        pulls_summary_file = self.folder + '/pulls_closed.csv'

        if os.path.isfile(
                pulls_file) and not os.path.isfile(pulls_summary_file):
            with open(pulls_file, 'r') as pulls:
                data = json.load(pulls)

                with open(pulls_summary_file, 'a') as output:
                    fieldnames = [
                        'pull_request', 'number_of_commits',
                        'number_of_comments', 'number_of_reviews', 'user_type',
                        'user_login', 'closed_at', 'number_of_additions',
                        'number_of_deletions', 'number_of_files_changed',
                        'number_of_days', 'message', 'number_of_characters',
                        'second_line_is_blank', 'language'
                    ]
                    writer = csv.DictWriter(output, fieldnames=fieldnames)
                    writer.writeheader()

                    for pull_request in data:
                        if pull_request['state'] == 'closed' and pull_request['merged_at'] is None:
                            try:
                                number_of_commits = self.collector.commits_in_pull_request(
                                    pull_request['number'])
                                number_of_comments = self.collector.comments_in_pull_request(
                                    pull_request['number'])
                                number_of_reviews = self.collector.reviews_in_pull_request(
                                    pull_request['number'])
                                pull_request_data = self.collector.pull_request(
                                    pull_request['number'])

                                number_of_files_changed = None
                                number_of_additions = None
                                number_of_deletions = None
                                message = ''

                                if pull_request_data:
                                    if 'changed_files' in pull_request_data:
                                        number_of_files_changed = pull_request_data[
                                            'changed_files']
                                    if 'additions' in pull_request_data:
                                        number_of_additions = pull_request_data[
                                            'additions']
                                    if 'deletions' in pull_request_data:
                                        number_of_deletions = pull_request_data[
                                            'deletions']
                                    if 'body' in pull_request_data:
                                        if pull_request_data['body'] is not None:
                                            # keep the body as text; encoding to bytes would break split() and len() below
                                            message = pull_request_data['body']

                                    number_of_characters = len(message)
                                    second_line_is_blank = False
                                    lines = message.split('\n')

                                    if len(lines) > 1:
                                        if not lines[1].strip():
                                            second_line_is_blank = True

                                    language = langid.classify(message)[0]

                                created_at = datetime.strptime(
                                    pull_request['created_at'],
                                    '%Y-%m-%dT%H:%M:%SZ')
                                closed_at = datetime.strptime(
                                    pull_request['closed_at'],
                                    '%Y-%m-%dT%H:%M:%SZ')
                                number_of_days = (closed_at - created_at).days

                                # the only difference between admin and external rows is the user_type label
                                user_type = 'Internals' if pull_request['user']['site_admin'] else 'Externals'
                                writer.writerow({
                                    'pull_request': pull_request['number'],
                                    'number_of_commits': len(number_of_commits),
                                    'number_of_comments': len(number_of_comments),
                                    'number_of_reviews': len(number_of_reviews),
                                    'user_type': user_type,
                                    'user_login': pull_request['user']['login'],
                                    'closed_at': closed_at,
                                    'number_of_additions': number_of_additions,
                                    'number_of_deletions': number_of_deletions,
                                    'number_of_files_changed': number_of_files_changed,
                                    'number_of_days': number_of_days,
                                    'message': message,
                                    'number_of_characters': number_of_characters,
                                    'second_line_is_blank': second_line_is_blank,
                                    'language': language
                                })
                            except Exception as ex:
                                with open('error.log', 'a') as errors:
                                    errors.write(str(ex))
                                    errors.write('\n Repository:' +
                                                 self.folder + '\n')
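Pull request bodies are often a mix of prose, Markdown, and code, which can skew language identification. One option, not part of the original script, is to strip fenced code blocks and URLs before calling langid.classify; a minimal sketch (classify_pr_body is a hypothetical helper):

import re
import langid

def classify_pr_body(body):
    # Drop fenced code blocks and bare URLs so langid mostly sees prose.
    text = re.sub(r'```.*?```', ' ', body, flags=re.DOTALL)
    text = re.sub(r'https?://\S+', ' ', text)
    return langid.classify(text)[0] if text.strip() else 'und'

print(classify_pr_body("Fixes the crash.\n```python\nprint('hola')\n```"))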
Example #21
    def preprocess(self, path_data_full, path_data_processed, data_range):
        all_quotes = []
        words_hist = []
        glove = GloVe('6B')
        all_tokens = list(glove.token_to_index.keys())
        # Different punctuations
        punct_dict = {
            '.': '',
            '!': ' ',
            '?': ' ',
            '...': ' ',
            ',': ' ',
            ';': ' ',
            ':': ' ',
            '\u201D': ' ',
            '\u2019\u2019': ' ',
            ' \'': ' ',
            '\' ': ' ',
            '"': ' ',
            '--': ' ',
            '-': ' ',
            '\u201C': '',
            '\u2019': ' ',
            '\u2026': ' ',
            '(': ' ',
            ')': ' ',
            '[': ' ',
            ']': ' ',
            '{': ' ',
            '}': ' ',
            '\u2014': ' ',
            '+': ' ',
            '„': ' ',
            '[]': ' ',
            '()': ' ',
            '{}': ' ',
            '=': ' ',
            '♕': ' ',
            '@': ' ',
            '*': ' ',
            '&': ' and ',
            '#': ' ',
            '~': ' ',
            '\u2E2E': ' ',
            '\u2640': ' ',
            '\\': ' ',
            '/': ' ',
            '\u2665': ' ',
            '\u2764': ' ',
            '\u2018': ' ',
            '\u265B': ' ',
            '\u262F': ' ',
            '\u2013': ' ',
            '\uFF07': ' ',
            '\uFF07\uFF07': ' ',
            '\uFF40': ' ',
            '\u02CB': ' ',
            '\u0300': ' ',
            '%': ' %',
            '\u02BC': ' ',
            '\u02BC\u02BC': ' ',
            'ღ': ' ',
            '\u2500': ' ',
            '\u202c': ' ',
            '\u0301': ' ',
            '\u202A': ' ',
            '<': ' ',
            '>': ' ',
            '❞': ' ',
            'ε': ' ',
            '\u2637': ' ',
            '↺': ' ',
            '®': ' ',
            '$': ' ',
            '❣': ' ',
            '\u2015': ' ',
            '\u0313': ' ',
            '\u201B': ' ',
            '\u2032': ' ',
            '\u05F3': ' ',
            '\'': ' ',
            '`': ' ',
            '\u200E': ' ',
            '  ': ' ',
            '   ': ' ',
            '    ': ' ',
        }
        # Word combinations and "shortcuts"
        short_dict = {
            'i\'m ': ' i am ',
            'i\u0301m ': ' i am ',
            'i\u2019m ': ' i am ',
            'it\'s ': ' it is ',
            'it\u2019s ': ' it is ',
            'it´s ': ' it is ',
            '\u00B4ll': ' will ',
            'won\u00b4t ': ' will not ',
            '\u00B4re ': ' are ',
            '\u00B4ve ': ' have ',
            'i\u00B4m ': ' i am ',
            ' won\'t ': ' will not ',
            'i\u0060m ': ' i am ',
            'man\'s ': ' mans ',
            'won\u2019t ': ' will not ',
            'can\'t ': ' cannot ',
            '\'re ': ' are ',
            'can\u0060t ': ' cannot ',
            '\u0060ve ': ' have ',
            'won\u0060t ': ' will not ',
            'n\u0060t ': ' not ',
            '\u02B9s ': ' is ',
            '\u0374s ': ' is ',
            '\u0374ve ': ' have ',
            '\u0374re ': ' are ',
            '\u02B9ve ': ' have ',
            '\u02B9re ': ' are ',
            '\'ve ': ' have ',
            '\'ll ': ' will ',
            '\u0060ll ': ' will ',
            '\'d ': ' would ',
            'n\'t ': ' not ',
            '\'s ': ' is ',
            'don\u2019t ': ' do not ',
            'me\u2026 ': ' me ',
            '\u2019s ': ' is ',
            '\u2019re ': ' are ',
            '\u0060re ': ' are ',
            'if\u2026 ': ' if ',
            'day\u2026 ': ' day ',
            'n\u2019t ': ' not ',
            '\u2019ll ': ' will ',
            '\u2019d ': ' would ',
            'n´t ': ' not ',
            '\u0301re ': ' are ',
            '\u0301ve ': ' have ',
            '̵͇̿̿з ': ' ',
            '•̪ⓧ ': ' ',
            '̵͇̿̿ ': ' ',
            'isno ': 'is no ',
            'kissand ': 'kiss and',
            'ryanlilly ': 'ryan lilly ',
            'meand ': 'me and',
            'whatlooks ': 'what looks',
            'girlfriendcut ': 'girlfriend cut',
            'worldyou ': ' world you ',
            'heavenis ': ' heaven is ',
            'worldso ': ' world so ',
            'havebetter ': ' have better ',
            'unknownand ': ' unknown and ',
            ' allof ': ' all of ',
            ' tolook ': ' to look ',
            ' notaffect ': ' not affect ',
            'likea ': ' like a ',
            'wantedas ': ' wanted as ',
            'agonyof ': ' agony of ',
            'skillthat ': ' skill that ',
            'worldsall ': ' worlds all ',
            'awaywhat ': ' away what ',
            'outwhat ': ' out what ',
            'savewhat ': ' save what ',
            'educationso ': ' education so ',
            'anyday ': ' any day ',
            'usdo ': ' us do ',
            ' dependsupona ': ' depends upon a',
            ' wheelbarrowglazed ': ' wheelbarrow glazed ',
            'waterbeside': 'water beside',
            ' whitechickens ': ' white chickens ',
            ' ain\'t ': ' aint ',
        }
        with open(path_data_full, encoding='utf8') as json_file:
            data = json.load(json_file)
            if data_range == 0:
                data_range = len(data)
            for i in range(data_range):
                quote = data[i].get('Quote')

                # Detect language with langid - if the quote is not English, skip it
                check_ld = langid.classify(quote)
                if check_ld[0] != 'en':
                    continue

                # Cleansing of quotes
                #quote = re.sub(r"(?<![A-Z])(?<!^)([A-Z])", r" \1", quote)
                quote = re.sub(r"\.(?!\s)(?!$)", r". ", quote)
                quote = quote.lower()
                quote = quote.strip('...')
                quote = quote.strip('"')
                quote = quote.strip()

                for diction in short_dict, punct_dict:
                    for inst in diction:
                        quote = quote.replace(inst, diction[inst])

                # Length control - skip quotes longer than 20 words
                enable_length_control = True
                if enable_length_control:
                    if len(quote.split()) > 20:
                        continue

                # Check if word is in tokens, if not, drop sentence:
                all_tokens_avail = True
                for word in quote.split():
                    if word not in all_tokens:
                        all_tokens_avail = False
                        break
                if not all_tokens_avail:
                    continue

                # Filter empty quotes out (need content to be used, obviously):
                if len(quote.split()) > 0:
                    all_quotes.append(quote)
                    for word in quote.split():
                        words_hist.append(word)

            self.total_word_count = len(words_hist)
            words_hist = Counter(words_hist)
            words_hist = words_hist.most_common(None)
            words, count = zip(*words_hist)
            self.word_count = dict(words_hist)
            vocabulary = dict(enumerate(words, 1))
            self.end_token = len(vocabulary) + 1
            vocabulary[f'{self.end_token}'] = self.end_token
            self.vocabulary = dict([(value, key)
                                    for key, value in vocabulary.items()])
            all_quotes = set(all_quotes)
            self.all_quotes = list(all_quotes)
            values = [' ', '', " ", ""]
            for i in self.all_quotes:
                for value in values:
                    if i == value:
                        self.all_quotes.remove(value)
                        break

            self.quote_count = len(self.all_quotes)

        json_file.close()

        # Create a list with all quotes
        processed_data = {
            'Quotes': self.all_quotes,
            'Vocabulary': self.vocabulary,
            'Word count': self.word_count,
            'Words total': self.total_word_count,
            'Quotes total': self.quote_count,
        }

        # Write to file:
        with open(path_data_processed, 'w') as outfile:
            json.dump(processed_data, outfile)

        outfile.close()
        # Delete all variables that are not needed anymore to clear memory
        del data, quote, words_hist, words, count, all_quotes, vocabulary
        # Return all needed values:
        return self.all_quotes, self.vocabulary, self.word_count,\
               self.total_word_count, self.end_token, self.quote_count,
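The trailing '  ', '   ' and '    ' entries in punct_dict only collapse runs of two to four spaces. A single regex pass handles whitespace runs of any length; a small alternative sketch (collapse_whitespace is a hypothetical helper, not part of the class above):

import re

def collapse_whitespace(quote):
    # Collapse any run of whitespace to a single space and trim the ends.
    return re.sub(r'\s+', ' ', quote).strip()

print(collapse_whitespace('to  be   or    not     to be'))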