Example #1
import csv

import dataCleaner_withStem  # project-local cleaning helpers (stemming variant)


def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    # Input file format: TweetID | Topic | Tweet | class (sentiment)
    with open(in_file_path, 'r', newline='') as tsv_file:
        # Initialize the term-document matrix (dict of dicts): one zeroed
        # count per selected n-gram for every known tweet ID
        for tid in tid_dict:
            tdm_dict[tid] = {}
            for topic in selected_n_grams:
                for token in selected_n_grams[topic]:
                    tdm_dict[tid][token] = 0

        # create a csv reader for the tab-separated input file
        tsv_in = csv.reader(tsv_file, delimiter='\t')
        for row in tsv_in:
            tid = row[0]
            tweet = row[2]
            if len(row) > 3:
                # clean the tweet: convert emoticons, strip special
                # characters, then remove stop words (with stemming)
                words = dataCleaner_withStem.removeStopWords(
                    dataCleaner_withStem.removeSpecialCharacters(
                        dataCleaner_withStem.convertEmoticons(tweet)), 1)
                if tid in tid_dict:
                    # tally which selected n-grams occur in this tweet
                    for topic in selected_n_grams:
                        for token in selected_n_grams[topic]:
                            if token in words:
                                tdm_dict[tid][token] += 1
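The TDM built here is a plain dict of dicts (tweet ID -> n-gram -> count). As a minimal sketch, assuming a fixed column ordering, the rows can be flattened into a feature matrix plus label vector for a classifier; the helper name to_feature_matrix and the toy data below are illustrative, not part of the original project:

def to_feature_matrix(tdm_dict, tid_dict):
    # fix a stable row/column order by sorting tweet IDs and tokens
    tids = sorted(tdm_dict)
    tokens = sorted(next(iter(tdm_dict.values())))
    X = [[tdm_dict[tid][tok] for tok in tokens] for tid in tids]
    y = [tid_dict[tid] for tid in tids]  # sentiment label per tweet
    return tokens, X, y

# toy data shaped like the structures above
tdm = {'101': {'good': 2, 'bad': 0}, '102': {'good': 0, 'bad': 1}}
labels = {'101': 'positive', '102': 'negative'}
print(to_feature_matrix(tdm, labels))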
def dataReader(in_file_path, n_grams, tid_dict, sc_dict):
    # Input file format: TweetID | Tweet | class (sentiment)
    with open(in_file_path, 'r', newline='') as tsv_file:
        # create a csv reader for the tab-separated input file
        tsv_in = csv.reader(tsv_file, delimiter='\t')
        for row in tsv_in:
            tid = row[0]
            tweet = row[1]
            if len(row) > 2:
                sentiment = row[2]
                # only consider unique tweets with a known sentiment class
                if tid not in tid_dict and sentiment in sc_dict:
                    tid_dict[tid] = sentiment
                    sc_dict[sentiment] += 1
                    # clean the tweet: convert emoticons, remove special
                    # characters (keeping smiley-related ones), drop stop
                    # words, and lower-case/stem the remaining tokens
                    words = dataCleaner_withStem.removeStopWords(
                        dataCleaner_withStem.removeSpecialCharacters(
                            dataCleaner_withStem.convertEmoticons(tweet)), 1)
                    # tally corpus-wide n-gram frequencies
                    for token in words:
                        if token in n_grams:
                            n_grams[token] += 1
                        else:
                            n_grams[token] = 1
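The flat n_grams counter filled in above is presumably the source of the selected_n_grams argument used by the createFV variants. The original selection criterion is not shown; a minimal sketch assuming a simple top-k frequency cut (the name selectTopNGrams and the default k are illustrative):

import heapq

def selectTopNGrams(n_grams, k=1000):
    # keep the k most frequent tokens; returns a dict shaped like the
    # flat selected_n_grams that the last createFV variant expects
    top = heapq.nlargest(k, n_grams.items(), key=lambda kv: kv[1])
    return dict(top)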
def dataReader(in_file_path, n_grams, tid_dict, topic_dict, topic_count_dict, sc_dict):
    # Input file format: TweetID | Topic | Tweet | class (sentiment)
    with open(in_file_path, 'r', newline='') as tsv_file:
        # create a csv reader for the tab-separated input file
        tsv_in = csv.reader(tsv_file, delimiter='\t')
        for row in tsv_in:
            tid = row[0]
            topic = row[1]
            tweet = row[2]
            if len(row) > 3:
                sentiment = row[3]
                # only consider unique tweets with a known sentiment class
                if tid not in tid_dict and sentiment in sc_dict:
                    tid_dict[tid] = sentiment
                    sc_dict[sentiment] += 1
                    # clean the tweet: convert emoticons, remove special
                    # characters, drop stop words, lower-case/stem tokens
                    words = dataCleaner_withStem.removeStopWords(
                        dataCleaner_withStem.removeSpecialCharacters(
                            dataCleaner_withStem.convertEmoticons(tweet)), 1)
                    topic = dataCleaner_withStem.removeSpecialCharacters(topic)
                    topic_dict[tid] = topic
                    # tally n-gram frequencies per topic
                    if topic not in n_grams:
                        n_grams[topic] = {}
                    for token in words:
                        if token in n_grams[topic]:
                            n_grams[topic][token] += 1
                        else:
                            n_grams[topic][token] = 1
                    # tally the number of tweets seen per topic
                    if topic in topic_count_dict:
                        topic_count_dict[topic] += 1
                    else:
                        topic_count_dict[topic] = 1
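This topic-keyed variant builds per-topic frequency counts, matching the topic-keyed selected_n_grams that the first createFV above iterates over. The actual selection step is again not shown; a minimal sketch assuming a per-topic minimum-count threshold (the name selectTopicNGrams and the threshold are illustrative):

def selectTopicNGrams(n_grams, min_count=2):
    # keep, per topic, the tokens seen at least min_count times; returns
    # the topic-keyed selected_n_grams shape the first createFV expects
    return {
        topic: {tok: c for tok, c in counts.items() if c >= min_count}
        for topic, counts in n_grams.items()
    }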
def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    # Input file format: TweetID | Tweet | class (sentiment)
    with open(in_file_path, 'r', newline='') as tsv_file:
        # Initialize the term-document matrix: one zeroed count per
        # selected n-gram for every known tweet ID
        for tid in tid_dict:
            tdm_dict[tid] = {token: 0 for token in selected_n_grams}

        # create a csv reader for the tab-separated input file
        tsv_in = csv.reader(tsv_file, delimiter='\t')
        for row in tsv_in:
            tid = row[0]
            tweet = row[1]
            if len(row) > 2:
                # clean the tweet with the same pipeline as dataReader
                words = dataCleaner_withStem.removeStopWords(
                    dataCleaner_withStem.removeSpecialCharacters(
                        dataCleaner_withStem.convertEmoticons(tweet)), 1)
                if tid in tid_dict:
                    # tally which selected n-grams occur in this tweet
                    for token in selected_n_grams:
                        if token in words:
                            tdm_dict[tid][token] += 1
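Putting the flat variants together, a plausible two-pass driver: one pass to count n-grams, a selection step, then a second pass to build the feature vectors. The file path, sentiment label set, and k are assumptions, and selectTopNGrams is the illustrative helper sketched earlier:

n_grams, tid_dict, tdm_dict = {}, {}, {}
sc_dict = {'positive': 0, 'negative': 0, 'neutral': 0}  # assumed label set

dataReader('tweets.tsv', n_grams, tid_dict, sc_dict)            # pass 1: n-gram counts
selected = selectTopNGrams(n_grams, k=500)                      # hypothetical selection step
createFV('tweets.tsv', selected, tdm_dict, tid_dict, sc_dict)   # pass 2: feature vectors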