Example #1
def test_replace_with_desc(tweet):
    assert (
        demoji.replace_with_desc(tweet, ":") ==
        "#startspreadingthenews yankees win great start by :Santa Claus: medium-dark skin tone: going 5strong innings with 5k’s:fire: :ox:\nsolo homerun :volcano::volcano: with 2 solo homeruns and:ogre: 3run homerun… :clown face: :person rowing boat: medium-light skin tone: :man judge: medium skin tone: with rbi’s … :fire::fire:\n:flag: Mexico: and :flag: Nicaragua: to close the game:fire::fire:!!!….\nWHAT A GAME!!..\n"
    )
    assert (
        demoji.replace_with_desc(tweet, "|") ==
        "#startspreadingthenews yankees win great start by |Santa Claus: medium-dark skin tone| going 5strong innings with 5k’s|fire| |ox|\nsolo homerun |volcano||volcano| with 2 solo homeruns and|ogre| 3run homerun… |clown face| |person rowing boat: medium-light skin tone| |man judge: medium skin tone| with rbi’s … |fire||fire|\n|flag: Mexico| and |flag: Nicaragua| to close the game|fire||fire|!!!….\nWHAT A GAME!!..\n"
    )
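For reference, demoji.replace_with_desc(string, sep) wraps each emoji's description in the given separator, which is exactly what the assertions above exercise. A minimal sketch with a hypothetical input string:

import demoji

# Each emoji becomes sep + description + sep:
print(demoji.replace_with_desc("what a game 🔥", sep=":"))
# -> "what a game :fire:"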
Example #2
def _emojis(self, text, _demoji=False):
    try:
        if _demoji:
            # Replace emojis with their text descriptions first
            text = demoji.replace_with_desc(text, "")
        # U+1F631 (face screaming in fear) maps to a negative token
        # (the original used the surrogate pair "\ud83d\ude31", which is
        # invalid in Python 3 source; "\U0001F631" is the real code point)
        shock_emoji = re.compile("[\U0001F631]+", flags=re.UNICODE)
        text = shock_emoji.sub(r'EMONEG', text)
        # Smile -- :), :-), ( :, (-:, :')
        text = re.sub(r'(:\)|:-\)|\(\s:|\(-:|:\'\))', 'EMOPOS', text)
        # Laugh -- :D, :-D, x-D, X-D, :-d, :d
        text = re.sub(r'(:D|:-D|x-D|X-D|:-d|:d)', 'EMOPOS', text)
        # Love -- <3, :*
        text = re.sub(r'(<3|:\*)', 'EMOPOS', text)
        # Wink -- ;-), ;-D, (-;
        text = re.sub(r'(;-\)|;-D|\(-;)', 'EMOPOS', text)
        # Sad -- :(, :-(, ) :, )-:, -_-
        text = re.sub(r'(:\(|:-\(|\)\s:|\)-:|-_-)', 'EMONEG', text)
        # Cry -- :,(, :'(, :"(
        text = re.sub(r'(:,\(|:\'\(|:"\()', 'EMONEG', text)
        # Shout -- :@
        text = re.sub(r'(:\@)', 'EMONEG', text)
        text = self.__handle_coded_emojis(text)
        return text
    except Exception as e:
        print('PreProcessor Error => ', e)
        return " "
Example #3
def preprocess_and_split_text(text):
    # Swap each emoji for its space-separated description, then
    # lowercase and split on whitespace
    text_without_emojis = demoji.replace_with_desc(text, sep=" ")
    return [word.lower() for word in text_without_emojis.split()]
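A quick usage check with a hypothetical input:

# "🔥" is described as "fire", so it survives as its own lowercase token
print(preprocess_and_split_text("Great WIN 🔥"))
# -> ['great', 'win', 'fire']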
Example #4
def Emoji(self, text):
    if self.kwargs['emoji'] == 'stay':
        return text
    elif self.kwargs['emoji'] == 'remove':
        # Note: emoji.get_emoji_regexp() was removed in emoji >= 2.0
        text_new = re.sub(emoji.get_emoji_regexp(), '', text)
    else:
        # Replace emojis with descriptions, then drop the ":" separators
        text_new = demoji.replace_with_desc(text).replace(':', ' ')
    return text_new
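For projects on emoji >= 2.0, where get_emoji_regexp() no longer exists, the 'remove' branch could be written with emoji.replace_emoji instead. A sketch, assuming emoji >= 2.0:

import emoji

def remove_emojis(text):
    # emoji >= 2.0 equivalent of re.sub(emoji.get_emoji_regexp(), '', text)
    return emoji.replace_emoji(text, replace='')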
Example #5
def run(self, event):
    # Write a demojized copy of the selected file to processed.txt,
    # closing both files when done
    with open(self.dataEntry.get(), 'r', encoding='utf-8') as file, \
            open('processed.txt', 'w', encoding='utf-8') as out:
        for line in file:
            out.write(demoji.replace_with_desc(line))

    self.btn2.configure(state=tk.NORMAL)
Example #6
def covert_emoji_to_text(self):
    """Convert emoji to text."""

    pre_file = self.convert_dict()
    emoji_dict = {}
    for k, v in pre_file.items():
        if isinstance(v, str):
            emoji_text = demoji.replace_with_desc(v.lower())
            emoji_text = self.preprocess1(emoji_text)
            # Replace every punctuation character with a space
            emoji_text2 = str(emoji_text).translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation)))
            if self.isEnglish(emoji_text2):
                emoji_dict[k] = emoji_text2

    text_df = pd.DataFrame.from_dict(emoji_dict, orient='index')
    text_df['index'] = text_df.index
    text_df.columns = ['clean_comments', 'commentId']
    text_df.to_csv(self.path + 'clean_comments.csv', encoding='utf-8-sig')
    return text_df
Example #7
def convert(message_path):
    # Open JSON file containing downloaded twitch chat
    with open(message_path, encoding="utf-8") as read_file:
        data = json.load(read_file)

    filename = outputpath + data["video"]["user_name"] + \
        " - " + data["video"]["id"] + ".txt"
    print("Now converting: " + filename)

    # Write filtered, demojized messages, closing the file when done
    with open(filename, "w", encoding="utf-8") as f:
        for i in data["comments"]:
            # skip messages from known bots
            if i["commenter"]["name"] in known_bots:
                continue
            filtered = demoji.replace_with_desc(i["message"]["body"], sep=":")
            # skip messages that mention other users
            if "@" in filtered:
                continue
            f.write(filtered + "\n")
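The per-comment filtering above boils down to two checks; a self-contained sketch with a fabricated comment dict:

import demoji

known_bots = ["nightbot"]  # hypothetical bot list
msg = {"commenter": {"name": "viewer1"}, "message": {"body": "what a play 🔥"}}

if msg["commenter"]["name"] not in known_bots:
    filtered = demoji.replace_with_desc(msg["message"]["body"], sep=":")
    if "@" not in filtered:
        print(filtered)  # -> "what a play :fire:"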
Example #8
def translate_text(text, tgt_lang='en'):
    try:
        # Text to lower
        text = text.lower()
        # Replace emojis with their text descriptions
        text = demoji.replace_with_desc(text, "")
        # Remove extra emojis, if any
        text = handle_emojis(text)
        # Clean text to remove unwanted tokens
        text = clean_text(text)
        # Translator initiated
        translator = google_translator()
        # Translate to the target language, then back to English
        # (note: the second call overwrites the first, so the result is English)
        text = translator.translate(text, lang_tgt=tgt_lang)
        text = translator.translate(text, lang_tgt="en")
        # Check auto-correct
        # spell = Speller()
        # text = spell(text)
        return text

    except Exception as e:
        print('Error => ', e)
        return None
Example #9
def json_converter_nofilter(lang_folders, en_folders, de_folders, dest,
                            dest_cleaned):
    for lang in lang_folders:
        if lang == "De":
            folders = de_folders
        else:
            folders = en_folders

        print("Started working, checking for missing files")
        # find dates that are missing; check one folder in source because we assume all have the same dates available
        files_source = [
            k for k in os.listdir(os.path.join(path_comp, source, folders[0]))
            if ".json" in k
        ]
        # convert to datelist
        dates_source = [
            re.search(r'\d{4}-\d{2}-\d{2}', file).group()
            for file in files_source
        ]

        # check which files already exist
        files_dest = [
            k for k in os.listdir(os.path.join(path_comp, dest, folders[0]))
            if ".csv" in k
        ]
        files_dest_cleaned = [
            k for k in os.listdir(
                os.path.join(path_comp, dest_cleaned, folders[0]))
            if ".csv" in k
        ]

        # inner join list and find last date that exists in both
        files_dest_both = list(set(files_dest) & set(files_dest_cleaned))
        # convert to datelist
        dates_dest = [
            re.search(r'\d{4}-\d{2}-\d{2}', file).group()
            for file in files_dest_both
        ]

        # find files in source but not dest
        dates_missing = list(set(dates_source) - set(dates_dest))

        if len(dates_missing) == 0:
            print("No missing files found")

        # go thru all dates
        for date in dates_missing:

            # set up list
            tweets = []
            # go into each folder and concat tweets into a df
            for folder in folders:
                print(f"Working in {date} {folder}")
                # create filename from folder name together with date
                filename = f"{folder}_{date}.json"
                # create path
                path1 = os.path.join(path_comp, source, folder, filename)

                # load json files, closing each file after reading
                if filename in os.listdir(
                        os.path.join(path_comp, source, folder)):
                    with open(path1, 'r', encoding="utf8") as json_file:
                        for line in json_file:
                            tweets.append(json.loads(line, parse_int=str))

            # convert to df
            df = pd.DataFrame(tweets)

            # clean dataframe
            print("Cleaning df")
            df = df_cleaner(df, lang_controller=True, lang=lang)

            new_filename_csv = f"{lang}_NoFilter_{date}.csv"

            # save df
            print("Saving data")
            df.to_csv(os.path.join(path_comp, dest, f"{lang}_NoFilter",
                                   new_filename_csv),
                      index=False)

            # now replace emojis and save in different destination
            df["tweet"] = df["tweet"].swifter.progress_bar(False).apply(
                lambda tweet: demoji.replace_with_desc(tweet, sep=" "))

            # replace _ in emoji descriptions with " "
            df.tweet = df.tweet.str.replace("_", " ")

            # save df
            df.to_csv(os.path.join(path_comp, dest_cleaned, f"{lang}_NoFilter",
                                   new_filename_csv),
                      index=False)
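The missing-date bookkeeping above is plain set arithmetic over dates parsed out of filenames; a self-contained sketch with made-up names:

import re

files_source = ["En_2020-03-01.json", "En_2020-03-02.json"]  # hypothetical
files_dest = ["En_NoFilter_2020-03-01.csv"]                  # hypothetical

dates_source = [re.search(r'\d{4}-\d{2}-\d{2}', f).group() for f in files_source]
dates_dest = [re.search(r'\d{4}-\d{2}-\d{2}', f).group() for f in files_dest]

# files in source but not yet converted at dest
print(sorted(set(dates_source) - set(dates_dest)))  # -> ['2020-03-02']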
Example #10
def json_converter_companies(source, dest, dest_cleaned, company_folders,
                             subfolders):
    for subfolder in subfolders:
        print(f"Working on {subfolder}")
        new_dest = os.path.join(path_comp, dest, "Companies", subfolder)
        new_dest_cleaned = os.path.join(path_comp, dest_cleaned, "Companies2",
                                        subfolder)

        # create folder in new destination if it does not already exist
        if not os.path.exists(os.path.join(path_comp, new_dest)):
            os.mkdir(os.path.join(path_comp, new_dest))
        if not os.path.exists(os.path.join(path_comp, new_dest_cleaned)):
            os.mkdir(os.path.join(path_comp, new_dest_cleaned))

        # now go into each company folder in the source and concat files from the same day together
        # for this need to check if files exist in both and control for it
        files_de = os.listdir(
            os.path.join(path_comp, source, "Companies_de", f"{subfolder}_de"))
        files_en = os.listdir(
            os.path.join(path_comp, source, "Companies_en", f"{subfolder}_en"))

        # get the dates available in both datasets
        dates_de = [
            re.search(r'\d{4}-\d{2}-\d{2}', file).group() for file in files_de
        ]
        dates_en = [
            re.search(r'\d{4}-\d{2}-\d{2}', file).group() for file in files_en
        ]
        dates_both_source = list(set(dates_de) & set(dates_en))
        dates_all_source = list(set(dates_de + dates_en))

        # now check which dates already exist at dest
        files_dest = os.listdir(
            os.path.join(path_comp, dest, "Companies", subfolder))
        files_dest_cleaned = os.listdir(
            os.path.join(path_comp, dest_cleaned, "Companies2", subfolder))

        # inner join
        files_dest_both = list(set(files_dest) & set(files_dest_cleaned))

        # extract dates
        dates_exist = [
            re.search(r'\d{4}-\d{2}-\d{2}', file).group()
            for file in files_dest_both
        ]

        # find all missing dates; if one folder has more files than the other,
        # just redo those dates, because that's quicker than accounting for it
        # with a separate loop
        dates_missing = list(set(dates_all_source) - set(dates_exist))

        # find dates missing that exist in both sources
        dates_both_missing = [
            k for k in dates_missing if k in dates_both_source
        ]

        if dates_both_missing == []:
            print("No files missing that exist in german and english folders")
        else:
            print(
                f"Moving on to files that only exist in both folders for {subfolder}"
            )

        # now, for each date available in both, go through both folders and concat files, then clean and save them
        for date in dates_both_missing:
            print(f"Working on {subfolder}, {date}")
            # go into english folder
            tweets = []
            for folder in company_folders:
                file = f"{subfolder}_{date}_{folder.split('_')[1]}.json"
                path = os.path.join(path_comp, source, folder,
                                    f"{subfolder}_{folder.split('_')[1]}",
                                    file)
                with open(path, 'r', encoding="utf8") as json_file:
                    for line in json_file:
                        tweets.append(json.loads(line, parse_int=str))

            # convert to df
            df = pd.DataFrame(tweets)

            # clean df
            df = df_cleaner(df)

            # check if df still contains entries
            if len(df) > 0:

                # save df
                new_filename_csv = f"{subfolder}_{date}.csv"

                # save df
                print("Saving data in both")

                df.to_csv(os.path.join(path_comp, new_dest, new_filename_csv),
                          index=False)

                ###########
                # now replace emojis and save in different destination
                ###########
                df["tweet"] = df["tweet"].swifter.progress_bar(False).apply(
                    lambda tweet: demoji.replace_with_desc(tweet, sep=" ")
                )  # replace _ from emojis with " "
                df.tweet = df.tweet.str.replace("_", " ")
                #save df
                df.to_csv(os.path.join(path_comp, new_dest_cleaned,
                                       new_filename_csv),
                          index=False)

        # now continue for dates not in both
        dates_de_only = list(set(dates_de) - set(dates_en))

        # find missing dates for german only
        dates_de_only_missing = [
            k for k in dates_de_only if k not in dates_exist
        ]

        if dates_de_only_missing == []:
            print("No files missing that exist in german folders only")
        else:
            print(
                f"Moving on to files that only exist in the german folder for {subfolder}"
            )

        # only clean german files
        for date in dates_de_only_missing:
            print(f"Working on {subfolder}, {date}")
            tweets = []
            file = f"{subfolder}_{date}_de.json"
            path = os.path.join(path_comp, source, "Companies_de",
                                f"{subfolder}_de", file)
            with open(path, 'r', encoding="utf8") as json_file:
                for line in json_file:
                    tweets.append(json.loads(line, parse_int=str))

            # convert to df
            df = pd.DataFrame(tweets)

            # clean df
            df = df_cleaner(df)

            # check if df still contains rows
            if len(df) > 0:
                # save df
                new_filename_csv = f"{subfolder}_{date}.csv"

                # save df
                print("Saving german data")
                df.to_csv(os.path.join(path_comp, new_dest, new_filename_csv),
                          index=False)

                ###########
                # now replace emojis and save in different destination
                ###########
                df["tweet"] = df["tweet"].swifter.progress_bar(False).apply(
                    lambda tweet: demoji.replace_with_desc(tweet, sep=" ")
                )  # replace _ from emojis with " "
                df.tweet = df.tweet.str.replace("_", " ")
                #save df
                df.to_csv(os.path.join(path_comp, new_dest_cleaned,
                                       new_filename_csv),
                          index=False)

        # same for english only
        # now continue for dates not in both
        dates_en_only = list(set(dates_en) - set(dates_de))

        # find missing
        dates_en_only_missing = [
            k for k in dates_en_only if k not in dates_exist
        ]

        if dates_en_only_missing == []:
            print("No files missing that exist in english folders only")
        else:
            print(
                f"Moving on to files that only exist in the english folder for {subfolder}"
            )
        # only clean english files
        for date in dates_en_only_missing:
            print(f"Working on {subfolder}, {date}")
            tweets = []
            file = f"{subfolder}_{date}_en.json"
            path = os.path.join(path_comp, source, "Companies_en",
                                f"{subfolder}_en", file)
            with open(path, 'r', encoding="utf8") as json_file:
                for line in json_file:
                    tweets.append(json.loads(line, parse_int=str))

            # convert to df
            df = pd.DataFrame(tweets)

            # clean df
            df = df_cleaner(df)

            if len(df) > 0:

                # save df
                new_filename_csv = f"{subfolder}_{date}.csv"

                # save df
                print("Saving english data")
                df.to_csv(os.path.join(path_comp, new_dest, new_filename_csv),
                          index=False)

                ###########
                # now replace emojis and save in different destination
                ###########
                df["tweet"] = df["tweet"].swifter.progress_bar(False).apply(
                    lambda tweet: demoji.replace_with_desc(tweet, sep=" ")
                )  # replace _ from emojis with " "
                df.tweet = df.tweet.str.replace("_", " ")
                #save df
                df.to_csv(os.path.join(path_comp, new_dest_cleaned,
                                       new_filename_csv),
                          index=False)
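The emoji cleanup applied in each branch above is the same three-step pattern; a minimal pandas-only sketch (without swifter), with a fabricated tweet:

import demoji
import pandas as pd

df = pd.DataFrame({"tweet": ["to the moon 🚀"]})
df["tweet"] = df["tweet"].apply(lambda t: demoji.replace_with_desc(t, sep=" "))
df.tweet = df.tweet.str.replace("_", " ", regex=False)
print(df.tweet[0])  # -> "to the moon  rocket " (sep surrounds the description)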
Example #11
def analyze_tweets(feeling):
    tag_list = {}
    emoji_list = {}
    words[feeling] = []
    lemmatized_tweets = {}
    tk = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()

    # with open(tweets_path + "dataset_dt_" + feeling.lower() + "_test_60k.txt", 'r', encoding="utf8") as file:
    with open(tweets_path + "dataset_dt_" + feeling.lower() + "_60k.txt", 'r', encoding="utf8") as file:
        lines = file.readlines()
        print("Start Analyzing tweet. Feeling: ", feeling)
        for line in tqdm(lines):

            # build map for hashtag and remove from line
            if '#' in line:
                hashtags = re.findall(r"#(\w+)", line)
                for htag in hashtags:
                    tag_list[htag] = tag_list.get(htag, 0) + 1
                    line = line.replace('#' + htag, '').replace('#', '')
                    words[feeling].append(htag)

            # find, store and replace emoji/emoticons in the line
            found = [em for em in emojiNeg + emojiPos + othersEmoji +
                     negemoticons + posemoticons if em in line]

            for em in found:
                e = demoji.replace_with_desc(em, ":")
                emoji_list[e] = emoji_list.get(e, 0) + 1
                # remove the raw symbol (its description is what we counted)
                line = line.replace(em, '')
                words[feeling].append(e)

            # replace slang from sentences
            slang_list = [s for s in slang_words.keys() if (s in line.split())]
            for s in slang_list:
                line = line.replace(s, slang_words[s])

            # remove punctuation
            punct_list = [p for p in punctuation if (p in line)]
            for p in punct_list:
                line = line.replace(p, '')

            # remove USERNAME and URL
            line = line.replace('USERNAME', '').replace('URL', '').lower()

            # remove citations
            citations = re.findall(r"@(\w+)", line)
            for cit in citations:
                line = line.replace('@' + cit, '').replace('@', '')

            # tokenize sentence
            word_tokens = tk.tokenize(line)
            pos_line = pos_tagging(word_tokens)

            # lemmatize nouns, adjective, verbs
            for pos in pos_line:
                if pos[1] in ['j', 'n', 'v']:
                    lemm_w = lemmatizer.lemmatize(pos[0], pos[1])
                    words[feeling].append(lemm_w)
                    lemmatized_tweets[lemm_w] = lemmatized_tweets.get(lemm_w, 0) + 1

        # display word cloud
        wordcloud_words = WordCloud(max_font_size=50, background_color="white", width=800,
                                    height=400).generate_from_frequencies(
            lemmatized_tweets)

        wordcloud_emoji = WordCloud(max_font_size=50, background_color="white", width=800,
                                    height=400).generate_from_frequencies(
            emoji_list)
        wordcloud_tag = WordCloud(max_font_size=50, background_color="white", width=800,
                                  height=400).generate_from_frequencies(
            tag_list)
        wordcloud_words.to_file("img/cloud_words_" + feeling + ".png")
        wordcloud_emoji.to_file("img/cloud_emoji_" + feeling + ".png")
        wordcloud_tag.to_file("img/cloud_tag_" + feeling + ".png")

    # Store emoji, tags and tweets for feeling
    emoji[feeling] = emoji_list
    tweets[feeling] = lemmatized_tweets
    tags[feeling] = tag_list
Example #12
def main(participants: List[str], args: argparse.Namespace,
         part_df: pd.DataFrame):
    now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    dirname = ''.join(['ss_', now])
    log = not args.no_log

    if log and not os.path.isdir(dirname):
        os.makedirs(dirname)

    code_length = 3
    participants_dict = {}

    while True:
        try:
            for p in participants:
                p_wants = part_df.loc[part_df[RSN_KEY] ==
                                      p][REQUEST_KEY].values[0]
                participants_dict[p] = Person(p, dirname, args.gifts,
                                              code_length, p_wants)

            for participant_name in participants:
                P = participants_dict[participant_name]

                if not P.can_pick():
                    continue

                targets_needed = P.gifts - len(P.targets)
                potential_targets = set(
                    filter(lambda u: participants_dict[u].can_be_picked(),
                           participants)) - {P.name}
                # sample() needs a sequence; sampling from a set errors on Python >= 3.11
                targets = sample(list(potential_targets), targets_needed)

                for t in targets:
                    P.targets.append(t)
                    participants_dict[t].get_picked()

                    if args.verbose:
                        print(P.name, ">>>", t)

            break

        # Multiple gift assignments per person can create unresolvable
        # situations; when that happens, just try again!
        except ValueError:
            continue

    if log:
        for participant_name in participants:
            P = participants_dict[participant_name]

            with open(P.filepath, 'w') as f:
                for t in P.targets:
                    T = participants_dict[t]
                    cohorts = []

                    for cohort_name in participants:

                        if not cohort_name == participant_name and t in participants_dict[
                                cohort_name].targets:
                            cohorts.append(cohort_name)

                    try:
                        s = ''.join([
                            'You', ' & ' * (len(cohorts) > 0),
                            ', '.join(cohorts), ' have ', t,
                            bool(T.wants) * ', who wants: ', T.wants, '\n'
                        ])
                        f.write(s)

                    except UnicodeEncodeError as E:
                        s = demoji.replace_with_desc(s)
                        f.write(s)

    if args.graph:
        plt.axes()

        # arrow properties
        margin = 1.1
        width = 0.01
        head_width = 2.5 * width
        head_length = 2 * width
        shape = 'left'
        length_includes_head = False
        squeeze = 0.1
        center_offset = 0.01
        basis_0 = np.asarray([
            0 + squeeze, 0, 1
        ])  # basis start and endpoints to transform later, in the z = 1 plane
        basis_1 = np.asarray([1 - squeeze, 0, 1])  #
        plt.xlim(-margin, margin)
        plt.ylim(-margin, margin)

        n = len(participants)
        THETA = np.linspace(
            0, 2 * math.pi,
            n + 1)[:-1]  # last value is equal to the zeroth, remove it
        X = list(map(lambda t: math.cos(t), THETA))
        Y = list(map(lambda t: math.sin(t), THETA))

        participant_coordinates = {}

        for x, y, pn in zip(X, Y, participants):
            P = participants_dict[pn]
            participant_coordinates[P.name] = (x, y)
            plt.text(x,
                     y,
                     P.code,
                     horizontalalignment='center',
                     verticalalignment='center')

        for participant_name in participants:
            P = participants_dict[participant_name]
            x0, y0 = participant_coordinates[P.name]

            for t in P.targets:
                x1, y1 = participant_coordinates[t]
                dx = x1 - x0
                dy = y1 - y0

                slope = dy / dx
                theta = math.atan(
                    slope) + (dx <= 0) * math.pi  # angle associated with line
                phi = theta - math.pi / 4  # angle associated with normal offset
                x_offset = center_offset * math.cos(phi)
                y_offset = center_offset * math.sin(phi)

                ROTATE = np.asarray([[math.cos(theta),
                                      math.sin(theta), 0],
                                     [-math.sin(theta),
                                      math.cos(theta), 0], [0, 0, 1]])

                base_length = math.sqrt(dx**2 + dy**2)
                SCALE = np.asarray([[base_length, 0, 0], [0, base_length, 0],
                                    [0, 0, 1]])

                TRANSLATION = np.asarray([[1, 0, 0], [0, 1, 0],
                                          [x0 + x_offset, y0 + y_offset, 1]])

                TRANSFORMATION = np.matmul(
                    np.matmul(ROTATE, SCALE),
                    TRANSLATION)  # full transformation in one matrix

                x0_prime, y0_prime = np.matmul(
                    basis_0, TRANSFORMATION)[:-1]  # Project back to 2D plane
                x1_prime, y1_prime = np.matmul(basis_1, TRANSFORMATION)[:-1]  #

                dx_prime = x1_prime - x0_prime
                dy_prime = y1_prime - y0_prime

                plt.arrow(x0_prime,
                          y0_prime,
                          dx_prime,
                          dy_prime,
                          head_width=head_width,
                          head_length=head_length,
                          width=width,
                          shape=shape,
                          length_includes_head=length_includes_head)

        plt.show()
Example #13
File: main.py Project: DawodPaul/IA
def demoji_token(token):
    f = []
    for cuv in token:
        f.append(demoji.replace_with_desc(cuv))

    return f
Example #14
def demojify(fp: io.IOBase):
    for line in fp:
        print(replace_with_desc(line), end="")
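Since demojify takes any file-like object, it can be tried with an in-memory stream (hypothetical input):

import io
from demoji import replace_with_desc

demojify(io.StringIO("hi 👋\n"))  # prints "hi :waving hand:"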
Example #15
def preprocess_and_split_text(text):
    text_without_emojis = demoji.replace_with_desc(text, sep=" ")
    result = text_without_emojis.split()
    return result
Example #16
def replace_with_desc(text):
    return demoji.replace_with_desc(text, "<emoji>")
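Here the separator doubles as a uniform tag around each description; a hypothetical call to the wrapper defined above:

print(replace_with_desc("deal 🔥"))
# -> "deal <emoji>fire<emoji>" — descriptions end up tagged for later parsing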
Example #17
        # save df
        new_filename_csv = f"{subfolder}_{date}.csv"

        # save df
        print("Saving data in both")

        df.to_csv(os.path.join(new_dest, new_filename_csv),
                  index=False)

        ###########
        # now replace emojis and save in different destination
        ###########
        df["tweet"] = df["tweet"].swifter.progress_bar(False).apply(
            lambda tweet: demoji.replace_with_desc(tweet, sep=" "))
        # replace _ in emoji descriptions with " "
        df.tweet = df.tweet.str.replace("_", " ")
        # save df
        df.to_csv(os.path.join(new_dest_cleaned, new_filename_csv),
                  index=False)

# now continue for dates not in both
dates_de_only = list(set(dates_de) - set(dates_en))

# find missing dates for german only
dates_de_only_missing = [k for k in dates_de_only if k not in dates_exist]

if dates_de_only_missing == []:
    print("No files missing that exist in german folders only")
else:
Example #18
def process_file(file):
    offset_time = None

    while not offset_time:
        print()
        offset_time_input = input(
            "Please input the time for " + file +
            " that the Zoom call started as hours:minutes:seconds in military/24hr time (e.g. 16:32:40): "
        )
        offset_time = re.match(r'\d{2}:\d{2}:\d{2}', offset_time_input)

    offset_time_group = offset_time.group()

    offset_time_delta = to_delta(offset_time_group + '.000')

    with open(output_folder + '/' + file[:-4] + '.vtt', 'w',
              encoding='utf-8') as vtt:
        vtt.write('WEBVTT' + '\n')

        with open(path + '/' + file, 'r', encoding='utf-8') as chat:

            storage = ''
            start_time = None
            end_time = None
            new_start_time = None

            for line in chat:
                demojied_line = demoji.replace_with_desc(line)

                split_line = demojied_line.split('\t')

                if len(split_line) > 3:
                    print(
                        'Looks like there was a tab in the text, please check the output for accuracy.'
                    )

                time = re.findall(r'\d{2}:\d{2}:\d{2}', split_line[0])

                if time:
                    if len(time) > 1:
                        print(
                            'There\'s an issue with the Zoom transcript timestamps'
                        )
                    else:
                        start_time = new_start_time

                        start_time_delta = to_delta(time[0] + '.000')
                        updated_start_delta = start_time_delta - offset_time_delta
                        str_start_delta = str(updated_start_delta)

                        if len(str_start_delta.split(':')[0]) < 2:
                            str_start_delta = '0' + str_start_delta

                        new_start_time = str_start_delta

                        if start_time:
                            updated_end_delta = updated_start_delta - timedelta(
                                milliseconds=1)
                            # updated_end_delta = end_time_delta - offset_time_delta
                            str_end_delta = str(updated_end_delta)

                            if len(str_end_delta.split(':')[0]) < 2:
                                str_end_delta = '0' + str_end_delta

                            end_time = str_end_delta

                            vtt.write('\n' + start_time + '.000 --> ' +
                                      end_time[:-3] + '\n')

                            vtt.write(storage)

                        storage = ' '.join(split_line[1:])
                else:
                    vtt.write(''.join(split_line))

            vtt.write('\n')
            end_time = (datetime.strptime(new_start_time, '%H:%M:%S') +
                        timedelta(milliseconds=2000)).time()
            vtt.write(
                str(new_start_time) + '.000 --> ' + str(end_time) + '.000\n')
            vtt.write(storage)
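to_delta is a project helper not shown in this snippet; a plausible sketch that parses "HH:MM:SS.mmm" into a timedelta (an assumption, not the project's actual code):

from datetime import datetime, timedelta

def to_delta(stamp: str) -> timedelta:
    # Hypothetical: parse "HH:MM:SS.mmm" into a timedelta
    t = datetime.strptime(stamp, "%H:%M:%S.%f")
    return timedelta(hours=t.hour, minutes=t.minute,
                     seconds=t.second, microseconds=t.microsecond)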
Example #19
#%% take time
time1 = time.time()
df["tweet_n"] = df["tweet"].swifter.progress_bar(False).apply(
    lambda tweet: emoji.demojize(tweet, delimiters=(" ", " ")))
time2 = time.time() - time1
print(time2)

#%%
a = df[["id", "tweet", "tweet_n"]].head(1000)

#%% with different package
import demoji

demoji.download_codes()  # deprecated no-op in demoji >= 1.0 (codes now ship with the package)

#%%
print(demoji.findall(text))
#%%
print(demoji.replace_with_desc(text, sep=""))
#%% take time
time1 = time.time()
df["tweet_n"] = df["tweet"].swifter.progress_bar(False).apply(
    lambda tweet: demoji.replace_with_desc(tweet, sep=" "))
time2 = time.time() - time1
print(time2)

#%% replace _ from emojis with " "
df.tweet_n = df.tweet_n.str.replace("_", " ")

#%% replace emoticons
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import re