def test_parse():
    parse, _ = liwc.load_token_parser(os.path.join(test_dir, "alpha.dic"))
    sentence = "Any alpha a bravo charlie Bravo boy"
    tokens = sentence.split()
    matches = [category for token in tokens for category in parse(token)]
    # matching is case-sensitive, so the only matches are "alpha" (A), "a" (A) and "bravo" (Bravo)
    assert matches == ["A", "A", "Bravo"]
def process(self, dataset):
    print("Processing LIWC...")
    # LIWC loading and processing
    parse, category_names = liwc.load_token_parser(self.path)
    result = pd.DataFrame(self.count_emotions(dataset, parse))
    return result
def test_parse(self):
    parse, _ = liwc.load_token_parser('tests/alpha.dic')
    sentence = 'Any alpha a bravo charlie Bravo boy'
    tokens = sentence.split()
    matches = [category for token in tokens for category in parse(token)]
    # matching is case-sensitive, so the only matches are "alpha" (A), "a" (A) and "bravo" (Bravo)
    self.assertEqual(matches, ['A', 'A', 'Bravo'])
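# A minimal sketch of the "alpha.dic" fixture the two tests above assume (the
# real fixture may differ). LIWC .dic files list category ids and names between
# "%" markers, followed by tab-separated word-to-category entries; with this
# content only "alpha", "a" and the lowercase "bravo" match, and the category
# names are ["A", "Bravo"].
import os

ALPHA_DIC = "%\n1\tA\n2\tBravo\n%\na\t1\nalpha\t1\nbravo\t2\n"
os.makedirs("tests", exist_ok=True)
with open(os.path.join("tests", "alpha.dic"), "w") as f:
    f.write(ALPHA_DIC)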
def get_raw_liwc_categories_for_topics(model, liwc_dictionary_path):
    parse, category_names = liwc.load_token_parser(liwc_dictionary_path)
    categories = []
    for topic in model["topics"]:
        word_categories = {}
        for word in topic:
            word_categories[word] = [category for category in parse(word)]
        categories.append(word_categories)
    return categories
def process_lines(dump: Iterable[list], stats: Mapping, users_dict: dict,
                  stats_dict: dict, fieldnames: Iterable[list],
                  args: argparse.Namespace) -> (str, Iterable[list]):
    """Process each tweet in the dump, counting LIWC categories for the dump's language."""
    first = next(dump)
    lang = first['lang']
    if lang in liwc_dicts:
        parse, category_names = liwc.load_token_parser(liwc_dicts[lang])

        if args.filter_users == 'per-category':
            filter_fields = ['male_', 'female_', 'org_']
        elif args.filter_users == 'per-age':
            filter_fields = ['<40_', '>=40_']
        else:
            filter_fields = ['']

        for field in filter_fields:
            for category in category_names:
                fieldnames.append('{}{}'.format(field, category))
            fieldnames.append('{}total'.format(field))
            for category in category_names:
                fieldnames.append('{}{}_count'.format(field, category))

        for fieldname in fieldnames:
            stats_dict[fieldname] = 0

        valid_users = None
        if args.filter_users:
            valid_users = get_valid_users(args, lang)
            if not valid_users:
                utils.log('The file of valid users could not be found')
                return (None, None)

        process_tweet(first, parse=parse, category_names=category_names,
                      stats_dict=stats_dict, users_dict=users_dict,
                      valid_users=valid_users, stats=stats, args=args)
        for raw_obj in dump:
            process_tweet(raw_obj, parse=parse, category_names=category_names,
                          stats_dict=stats_dict, users_dict=users_dict,
                          valid_users=valid_users, stats=stats, args=args)
        return (lang, category_names)
    else:
        return (None, None)
def get_liwc_features(essay):
    parse, category_names = liwc.load_token_parser('ML/resources/LIWC2007_Portugues_win.dic.txt')
    counter = Counter({x: 0 for x in category_names})
    tokens = tokenize(essay)
    counter.update(category for token in tokens for category in parse(token))
    dic = {0: counter}
    liwc_df = pd.DataFrame.from_dict(dic, orient='index').fillna(0)
    sc = StandardScaler()
    liwc_features = sc.fit_transform(liwc_df)
    return liwc_features
def load(self, dic_path):
    lexicon, _ = liwc.read_dic(dic_path)
    cat2lexicon = defaultdict(list)
    for word, categories in lexicon.items():
        for c in categories:
            if word not in cat2lexicon[c]:
                cat2lexicon[c].append(word)
    self.lexicon = cat2lexicon
    return liwc.load_token_parser(dic_path)
def get_liwc_categories_for_topics(model, liwc_dictionary_path, normalize=False):
    parse, category_names = liwc.load_token_parser(liwc_dictionary_path)
    topic_words = get_all_topic_words(model)
    categories = [category for token in topic_words for category in parse(token)]
    counts = Counter(categories)
    no_of_words = len(topic_words)
    return sort_counts(
        counts if normalize is False
        else {key: counts[key] / no_of_words for key in counts}
    )
def get_categories_for_text(text, liwc_dictionary_path):
    parse, category_names = liwc.load_token_parser(liwc_dictionary_path)

    def tokenize(text):
        # you may want to use a smarter tokenizer
        for match in re.finditer(r'\w+', text, re.UNICODE):
            yield match.group(0)

    tokens = tokenize(text.lower())
    categories = [category for token in tokens for category in parse(token)]
    counts = Counter(categories)
    return categories, counts
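# Hedged usage sketch for get_categories_for_text above; "LIWC2015_English.dic"
# is a placeholder path that must point at a real LIWC dictionary file.
categories, counts = get_categories_for_text(
    "We were happy, but now I worry.", "LIWC2015_English.dic")
print(counts.most_common(5))  # the five most frequent categories in the text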
def liwc_parse(textstring):
    parse, cat_names = liwc.load_token_parser(LIWC_dictionary)
    tokens = tokenize(textstring)
    rawtext_counts = Counter(category for token in tokens for category in parse(token))
    fpa = rawtext_counts['focuspast']
    fpr = rawtext_counts['focuspresent']
    ffu = rawtext_counts['focusfuture']
    fto = fpa + fpr + ffu
    if fto > 0:
        return fpa, fpr, ffu, fpa / fto, fpr / fto, ffu / fto
    else:
        return fpa, fpr, ffu, 0, 0, 0
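# Hedged usage sketch for liwc_parse above; LIWC_dictionary and tokenize are
# assumed to be defined at module level as in the snippet. The six return
# values are the raw focuspast/focuspresent/focusfuture counts followed by
# their proportions of all time-orientation hits.
fpa, fpr, ffu, p_past, p_present, p_future = liwc_parse(
    "I worked yesterday and I will rest tomorrow")
print(p_past, p_present, p_future)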
def process_lines(dump: Iterable[list], stats: Mapping, users_dict: dict,
                  stats_dict: dict, fieldnames: Iterable[list],
                  args: argparse.Namespace) -> (str, Iterable[list]):
    """Process each tweet in the dump, counting LIWC categories for the dump's language."""
    first = next(dump)
    lang = first['lang']
    if lang in liwc_dicts:
        parse, category_names = liwc.load_token_parser(liwc_dicts[lang])
        for category in category_names:
            fieldnames.append(category)
        fieldnames.append('total')
        for category in category_names:
            fieldnames.append('{}_count'.format(category))

        valid_users = get_valid_users(args, lang)
        if not valid_users:
            utils.log('The file of valid users could not be found')
            return (None, None)

        process_tweet(first, parse=parse, category_names=category_names,
                      fieldnames=fieldnames, stats_dict=stats_dict,
                      users_dict=users_dict, valid_users=valid_users,
                      stats=stats, args=args)
        for raw_obj in dump:
            process_tweet(raw_obj, parse=parse, category_names=category_names,
                          fieldnames=fieldnames, stats_dict=stats_dict,
                          users_dict=users_dict, valid_users=valid_users,
                          stats=stats, args=args)
        return (lang, category_names)
    else:
        return (None, None)
def LIWC_features(df, colname, dictionary):
    """
    Adds LIWC features to a given dataframe.
    Use the 'LIWC2007_English100131.dic' file.
    """
    parse, category_names = liwc.load_token_parser(dictionary)
    tknzr = TweetTokenizer()
    for i in range(len(category_names)):
        df[category_names[i]] = 0
    for i in range(df.shape[0]):
        t = tknzr.tokenize(df.loc[i, colname])
        features_counts = Counter(category for token in t for category in parse(token))
        for key, value in features_counts.items():
            df.loc[i, key] = value
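# Hedged usage sketch for LIWC_features above; the dictionary path is a
# placeholder for an actual LIWC 2007 .dic file.
tweets = pd.DataFrame({"text": ["we are so happy today", "they were angry"]})
LIWC_features(tweets, "text", "LIWC2007_English100131.dic")
print(tweets.head())  # one extra integer column per LIWC category, holding per-row counts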
def calculate_file_score(extracted_docs_fp, csv_path):
    """
    Writes LIWC scores of the different files to a csv file.
    :return:
    """
    print(extracted_docs_fp)
    parse, _ = liwc.load_token_parser('../data/LIWC2015Dictionary.dic')
    with open(csv_path, 'w') as liwc_csv_results:
        writer = csv.writer(liwc_csv_results)
        for row_number, file in enumerate(listdir(extracted_docs_fp)):
            row = [file]
            with open(path.join(extracted_docs_fp, file)) as document:
                doc_str = document.read()
            doc_str = preprocess_gov2(doc_str)
            female_bias, male_bias = calculate_doc_score(doc_str, parse)
            row.append(female_bias)
            row.append(male_bias)
            writer.writerow(row)
def determineCategories(usr):
    parse = None
    totalWords = 0
    UsedWords = 0
    parse, category_names = liwc.load_token_parser('src/main/LIWC/LIWC_Turkish.dic')
    gettysburg_counts = None
    tweets = usr['preprocessedTweets']
    # print(tweets)
    for tweet in tweets:
        # print(tweet)
        words = tweet.split(" ")
        for eachWord in words:  # each word in a tweet
            totalWords = totalWords + 1
            optionalWords = eachWord.split("|")
            for eachOptionalWord in optionalWords:
                # each option of a word; runs until it finds an option that belongs to any category
                category_tokens = tokenize(eachOptionalWord)  # categories of the option
                gettysburg_counts = Counter(category for token in category_tokens
                                            for category in parse(token))
                if len(gettysburg_counts) > 0:
                    for cat in gettysburg_counts.items():
                        addCategory(cat)
                    UsedWords += 1
                    break
    catList = list()
    sum = 0
    for cat in sorted(categoryCounts.keys()):
        # print(cat, ":", str(categoryCounts[cat]))
        sum = sum + categoryCounts[cat]
    print("sum: " + str(sum))
    for cat in sorted(categoryCounts.keys()):
        normalized = categoryCounts[cat] / sum
        catList.append(cat + "," + str("%.3f" % normalized))
    updateDoc = {"username": sys.argv[1]}
    if usr is not None:
        liwcGroups = {"$set": {"groups": catList}}
        doc1 = col_User.update_one(updateDoc, liwcGroups)
        print("Updated User: " + sys.argv[1])
    else:
        print("There is no user as " + sys.argv[1])
def process_lines(dump: Iterable[list], stats: Mapping, words_dict: dict,
                  args: argparse.Namespace) -> str:
    """Process each tweet in the dump, counting LIWC category words for the dump's language."""
    first = next(dump)
    lang = first['lang']

    valid_users = None
    if args.users_file:
        utils.log('Specified a set of users to filter the tweets')
        valid_users = get_valid_users(args)
        if not valid_users:
            utils.log('The file of valid users could not be found\n')
            return None

    if lang in liwc_dicts:
        parse, category_names = liwc.load_token_parser(liwc_dicts[lang])
        for category in category_names:
            words_dict[category] = {}
        words_dict['words'] = 0
        words_dict['tweets'] = 0
        process_tweet(first, parse=parse, stats=stats, words_dict=words_dict,
                      valid_users=valid_users, args=args)
        for raw_obj in dump:
            process_tweet(raw_obj, parse=parse, stats=stats, words_dict=words_dict,
                          valid_users=valid_users, args=args)
        return lang
    else:
        return None
def compute_liwc_from_dict(df, col):
    parse, category_names = liwc.load_token_parser('./LIWC2015_English.dic')  # path of LIWC dictionary
    frames = []
    for text in df[col]:
        print(text)
        text_tokens = tokenize(text)
        print(text_tokens)
        text_counts = Counter(category for token in text_tokens for category in parse(token))
        print(text_counts)
        liwc_value_dic = {}
        for k, v in text_counts.items():
            liwc_value_dic['news_title'] = text
            word_count = len([word for word in text.split(' ')])
            liwc_value_dic['WC'] = word_count
            liwc_value_dic['WPS'] = sum([len(sent.split(' ')) for sent in sent_tokenize(text)]) / len(sent_tokenize(text))
            liwc_value_dic[k.split(",")[0].split(' ')[0]] = (v / word_count) * 100
        frames.append(pd.DataFrame([liwc_value_dic]))
        break
    df_liwc = pd.concat(frames)
    return df.merge(df_liwc, on=col)
def process_lines(dump: Iterable[list], stats: Mapping, tweets_dict: dict,
                  args: argparse.Namespace) -> str:
    """Process each tweet in the dump with the lexicon selected for the dump's language."""
    first = next(dump)
    lang = first['lang']

    valid_users = None
    if args.users_file:
        utils.log('Specified a set of users to filter the tweets')
        valid_users = get_valid_users(args)
        if not valid_users:
            utils.log('The file of valid users could not be found\n')
            return None

    if args.lexicon == 'liwc' and lang in liwc_dicts:
        parse, category_names = liwc.load_token_parser(liwc_dicts[lang])
    elif args.lexicon == 'emolex' and initEmotionLexicon(lang=lang):
        ...
    else:
        return None

    process_tweet(first, parse=parse, stats=stats, tweets_dict=tweets_dict,
                  valid_users=valid_users, args=args)
    for raw_obj in dump:
        process_tweet(raw_obj, parse=parse, stats=stats, tweets_dict=tweets_dict,
                      valid_users=valid_users, args=args)
    return lang
def calculate_gendered_count(input_fp):
    """
    :param input_fp:
    :return:
    """
    parse, _ = liwc.load_token_parser('../data/LIWC2015Dictionary.dic')
    query_df = pd.read_csv(input_fp, names=["qid", "terms", "weight"])
    # df_unbiased = pd.read_csv(UNBIASED_EXPANSION_FP, names=["qid", "terms", "weight"])
    qids = list(pd.unique(query_df['qid']))
    total_fm_terms = 0
    total_male_terms = 0
    for query_id in qids:
        female_bias = 0
        male_bias = 0
        query_terms = query_df[query_df["qid"] == query_id]
        query = query_terms["terms"].tolist()
        query_str = " ".join(query)
        doc_tokens = tokenize(query_str)
        # liwc_counts = Counter(category for token in doc_tokens for category in parse(token))
        categories = []
        # token_counter = 0
        for token in doc_tokens:
            token = token.lower()
            # token_counter += 1
            for category in parse(token):
                categories.append(category)
        liwc_counts = Counter(categories)
        if "female" in liwc_counts.keys():
            female_bias = liwc_counts["female"]
        if "male" in liwc_counts.keys():
            male_bias = liwc_counts["male"]
        total_fm_terms += female_bias
        total_male_terms += male_bias
    return total_fm_terms, total_male_terms
import numpy as np
import pandas as pd
import re
import json
from collections import Counter
from heapq import nlargest
import pickle
import plotly.express as px
import plotly.graph_objects as go
from PIL import Image
import streamlit as st
import liwc
import altair as alt
# import spacy
# nlp = spacy.load("en_core_web_sm")

parse, category_names = liwc.load_token_parser('data/queryDictionary.dic')


def tokenize(text):
    # you may want to use a smarter tokenizer
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)


def parseLIWC(x):
    gettysburg_tokens = tokenize(x)
    gettysburg_counts = Counter(category for token in gettysburg_tokens
                                for category in parse(token))
    return gettysburg_counts


def load_data():
    DATA_URL = "data/kaggle_train.csv"
    data = pd.read_csv(DATA_URL)
survey1["text"] = survey1.Q65 texts = survey1.text texts = [text.lower() for text in survey1.texts] # %% def tokenize(text): tokens = nltk.word_tokenize(text) return tokens # def tokenize(text): # # you may want to use a smarter tokenizer # for match in re.finditer(r"\w+", text, re.UNICODE): # yield match.group(0) parse, category_names = liwc.load_token_parser("/Users/kylie/LIWC.dic") # %% tokens = tokenize(test) from collections import Counter counts = Counter(category for token in tokens for category in parse(token)) print(counts["function"]) # => Counter({'funct': 58, 'pronoun': 18, 'cogmech': 17, ...}) # %% # Compare what did you hear to what did you hope to hear
import itertools
from collections import Counter

import liwc
import numpy as np

from text_features.config import Config

"""
Contains functions to compute LIWC measures. This includes proportions of words
in each category in the LIWC 2007 dictionary as well as linguistic process
measures computed as part of the LIWC tool (i.e. word count, etc.).
"""

config = Config()
PARSE, CAT_NAMES = liwc.load_token_parser(config.LIWC_2007_PATH)


def extract_liwc_feats(segments):
    """
    Computes LIWC features for a list of text segments and stores them in a dictionary.

    :param segments: List of text segments, where each segment is a string.
        Segments are used to determine which words are consecutive in order to
        identify bigrams and trigrams.
    :return: feats_dict: Dictionary mapping feature name to value for the transcript
    """
    # compute feature values
    feats_dict = {}
    segments = [s.split(" ") for s in segments]
    words = list(itertools.chain.from_iterable(segments))
    # Generate lists of all bigrams and trigrams because some are in the LIWC
    # vocabulary (e.g. "i don't know", "you know")
    bigrams = []
    trigrams = []
    for segment in segments:
def psycho_naming(coords, node_size):
    """
    Perform Automated Sentiment Labeling of each coordinate from a list of MNI
    coordinates.

    Parameters
    ----------
    coords : list
        List of (x, y, z) tuples in voxel-space corresponding to a coordinate
        atlas used or which represent the center-of-mass of each parcellation
        node.
    node_size : int
        Spherical centroid node size in the case that coordinate-based
        centroids are used as ROI's for tracking.

    Returns
    -------
    labels : list
        List of string labels corresponding to each coordinate-corresponding
        psychological topic.

    References
    ----------
    .. [1] Tor D., W. (2011). NeuroSynth: a new platform for large-scale
      automated synthesis of human functional neuroimaging data. Frontiers in
      Neuroinformatics. https://doi.org/10.3389/conf.fninf.2011.08.00058
    .. [2] Tausczik, Y. R., & Pennebaker, J. W. (2010). The psychological
      meaning of words: LIWC and computerized text analysis methods. Journal
      of Language and Social Psychology. https://doi.org/10.1177/0261927X09351676
    """
    import liwc
    import pkg_resources
    import nimare
    import nltk
    from collections import Counter
    from nltk.corpus import sentiwordnet as swn
    from pynets.core.utils import flatten
    from nltk.stem import WordNetLemmatizer

    try:
        swn.senti_synsets('TEST')
    except:
        nltk.download('sentiwordnet')
        nltk.download('wordnet')

    with open(pkg_resources.resource_filename("pynets", "runconfig.yaml"), 'r') as stream:
        hardcoded_params = yaml.load(stream)
        try:
            LIWC_file = hardcoded_params['sentiment_labeling']['liwc_file'][0]
        except FileNotFoundError:
            print('LIWC file not found. Check runconfig.yaml.')
        try:
            neurosynth_dset_file = hardcoded_params['sentiment_labeling']['neurosynth_db'][0]
        except FileNotFoundError:
            print('Neurosynth dataset .pkl file not found. Check runconfig.yaml.')
    stream.close()

    try:
        dset = nimare.dataset.Dataset.load(neurosynth_dset_file)
    except FileNotFoundError:
        print('Loading neurosynth dictionary failed!')

    try:
        parse, category_names = liwc.load_token_parser(LIWC_file)
    except FileNotFoundError:
        print('Loading LIWC dictionary failed!')

    labels = []
    print('Building coordinate labels...')
    for coord in coords:
        print(coord)
        roi_ids = dset.get_studies_by_coordinate(np.array(coord).reshape(1, -1), node_size)
        labs = dset.get_labels(ids=roi_ids)
        labs_filt = list(
            flatten([
                list([
                    i for j in swn.senti_synsets(i)
                    if j.pos_score() > 0.75 or j.neg_score() > 0.75
                ]) for i in labs
            ]))
        st = WordNetLemmatizer()
        labs_filt = list(set([st.lemmatize(k) for k in labs_filt]))
        # Count LIWC categories for the filtered labels, excluding purely
        # grammatical or overly broad categories.
        excluded = {'bio', 'adj', 'verb', 'conj', 'adverb', 'auxverb', 'prep',
                    'article', 'ipron', 'ppron', 'pronoun', 'function',
                    'affect', 'cogproc'}
        liwc_counts = dict(
            Counter(top.split(' (')[0] for token in labs_filt
                    for top in parse(token)
                    if top.split(' (')[0].lower() not in excluded))
        liwc_counts_ordered = dict(
            sorted(liwc_counts.items(), key=lambda x: x[1], reverse=True))
        # Keep only the dominant emotional valence when both are present.
        if 'posemo' in liwc_counts_ordered and 'negemo' in liwc_counts_ordered:
            if liwc_counts_ordered['posemo'] > liwc_counts_ordered['negemo']:
                del liwc_counts_ordered['negemo']
            else:
                del liwc_counts_ordered['posemo']
        liwc_counts_ordered_ratios = {}
        for i in liwc_counts_ordered:
            liwc_counts_ordered_ratios[i] = float(liwc_counts_ordered[i]) / float(
                sum(liwc_counts_ordered.values()))
        lab = ' '.join(
            map(str, [
                key + ' ' + str(np.round(100 * val, 2)) + '%'
                for key, val in liwc_counts_ordered_ratios.items()
            ]))
        print(lab)
        if len(lab) > 0:
            labels.append(lab)
        else:
            labels.append(np.nan)
        del roi_ids, labs_filt, lab, liwc_counts_ordered, liwc_counts, labs
        print('\n')
    return labels
from os.path import isfile, join
import shelve
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import liwc
import re
from collections import Counter

LIWC_dictionary = '/home/xhta/Robot/liwc/timeori.dic'
POSP_METADATA = '/home/xhta/Robot/proj/posp/posp_metadata.csv'
POSP_CLEANDATA = '/home/xhta/Robot/proj/posp/clean'

parse, cat_names = liwc.load_token_parser(LIWC_dictionary)


def tokenize(text):
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)


from string import punctuation
translator = str.maketrans(' ', ' ', punctuation)

from nltk.corpus import stopwords
stoplist = set(stopwords.words('english'))

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

import spacy
    writer.writerows(csvData)
    csvFile.close()

import re
from collections import Counter


def tokenize(text):
    # you may want to use a smarter tokenizer
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)


import liwc
parse, category_names = liwc.load_token_parser('./LIWC2015_English.dic')

for text in texts:
    gettysburg_tokens = tokenize(text)
    # now flatmap over all the categories in all of the tokens using a generator:
    gettysburg_counts = Counter(category for token in gettysburg_tokens
                                for category in parse(token))
    # and print the results:
    print(gettysburg_counts)

"""
nodomaintags = []
domaintags = []
for text in domaintext:
    t = nlp(translator(text))
    labels = [x.label_ for x in t.ents]
    print(labels)
from readorsee import settings
from readorsee.data.models import InstagramUser, InstagramPost
from readorsee.data.preprocessing import NLTKTokenizer, Tokenizer
import h5py
import os
import numpy as np
import pandas as pd
import liwc
from skimage import io, color
from typing import *
from collections import Counter
from pathlib import Path

if Path(settings.PATH_TO_PT_LIWC).exists():
    parse, category_names = liwc.load_token_parser(settings.PATH_TO_PT_LIWC)

tokenizer = NLTKTokenizer()

__all__ = ["get_features"]


def get_features(
    profile: InstagramUser, period: int
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float]]:
    posts = profile.get_posts_from_qtnre_answer_date(period)
    faces = []
    likes = []
    captions = []
    comments = []
    hue = []
    saturation = []
    value = []
def age_check(path, id, id1):
    sql = ("SELECT Chat_ID FROM chat WHERE (Participant1 = %s && Participant2 = %s) "
           "|| (Participant1 = %s && Participant2 = %s)") % (id, id1, id1, id)
    mycursor.execute(sql)
    cid = mycursor.fetchone()
    subid = str(cid[0])
    file_name = path
    file_name1 = "C:\\Users\\Kripa\\Desktop\\" + subid + "1.txt"
    # f = open(file_name1, "w", encoding="utf8")
    with open(file_name, encoding="utf8") as chat:
        chat_text = chat.read()
    sr = ""
    for ch in chat_text:
        file1 = open('C:\\Users\\Kripa\\Desktop\\emo.txt', 'r', encoding="utf8")
        Lines = file1.readlines()
        check = 0
        # Strips the newline character
        for line in Lines:
            # print(.format(count, line.strip()))
            if ch == line.strip():
                check = 1
        if check == 0:
            sr += ch
        else:
            sr += "~"
    f = open(file_name1, "w", encoding="utf8")
    f.write(sr)
    f.close()
    with open(file_name1, 'r') as in_file:
        stripped = (line.strip() for line in in_file)
        lines = (line.split(",") for line in stripped if line)
        with open('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv', 'w', newline='') as out_file:
            writer = csv.writer(out_file)
            writer.writerow(('name', 'msg'))
            writer.writerows(lines)
    line = 0
    punct = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    with open("C:\\Users\\Kripa\\Desktop\\" + subid + "1.csv", 'r') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            if row[0] == str(id):
                line += 1
                cntt[0] += len(row[1].split())
                x = 0
                sr = ""
                for val in row[1]:
                    if val == '~':
                        cntt[5] += 1
                    if val in punct:
                        cntt[2] += 1
                    else:
                        sr += val
                        x += 1
                cntt[1] += int(x / len(row[1].split()))
                for word in sr.split():
                    with open('C:\\Users\\Kripa\\Desktop\\slangdic.csv') as csv_file1:
                        csv_reader1 = csv.reader(csv_file1, delimiter=',')
                        for row1 in csv_reader1:
                            if word == row1[0]:
                                cntt[4] += 1
                    duplicates = []
                    for char in word:
                        # checking whether the character has a duplicate or not
                        # str.count(char) returns the frequency of a char in the str
                        if word.count(char) > 2:
                            # appending to the list if it's already not present
                            if char not in duplicates:
                                duplicates.append(char)
                    cntt[3] += len(duplicates)
    cntt[0] = int(cntt[0] / line)
    cntt[1] = int(cntt[1] / line)
    cntt[2] = cntt[2] - cntt[5]
    sql = "SELECT Posts, Followers, Following FROM user WHERE User_ID=%s" % id
    mycursor.execute(sql)
    res = mycursor.fetchone()
    cntt[6] = res[0]
    cntt[7] = res[1]
    cntt[8] = res[2]
    cn = np.asarray([cntt])
    cn = cn.astype('float64')
    model = keras.models.load_model("dnn30")
    predictions = model.predict_classes(cn)
    if predictions == [[1]]:
        lst = []
        line_count = 0
        lt = 0
        with open('C:\\Users\\Kripa\\Desktop\\age_ask.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                lst.append(row[0])
        with open('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                # print(row[0], id1)
                line_count = line_count + 1
                if row[0] == str(id1):
                    for ele in lst:
                        if ele in row[1]:
                            lt = line_count
                            break
        range = 0
        ag_ch = 0
        ln_ct = 0
        age_final = 0
        with open('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                if ln_ct >= lt:
                    ln_ct = ln_ct + 1
                    if row[0] == str(id) and range < 10:
                        range = range + 1
                        for word in row[1].split():
                            if word.isnumeric():
                                a = int(word)
                                if a >= 12 and a <= 18:
                                    age_final = a
                                    ag_ch = 1
                else:
                    ln_ct = ln_ct + 1
                if ag_ch == 1:
                    break
        if age_final == 0:
            age_final = 14
        sql = "INSERT INTO monitor VALUES(%s,%s)" % (id, age_final)
        mycursor.execute(sql)
        tbl = "user_" + str(id)
        sql = ("create table {0} as select Chat_ID,Participant1 as Sender from chat "
               "inner join user on user.User_ID=chat.Participant2 where User_ID=%s "
               "UNION select Chat_ID,Participant2 as Sender from chat inner join user "
               "on user.User_ID=chat.Participant1 where User_ID=%s").format(tbl) % (id, id)
        mycursor.execute(sql)
        sql = ("alter table {0} add S1 decimal default 0, add S2 decimal default 0, "
               "add S3 decimal default 0, add S4 decimal default 0, "
               "add S5 decimal default 0, add S6 decimal default 0, "
               "add Grooming_Not varchar(3) default 'No'").format(tbl)
        mycursor.execute(sql)
        sql = ("alter table {0} add primary key(Chat_ID), "
               "add foreign key(Sender) references user(User_ID)").format(tbl)
        mycursor.execute(sql)
        with open('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                row[1] = row[1].lower()
                lemmatizer = WordNetLemmatizer()
                lemr = ""
                for word in row[1].split():
                    lem = lemmatizer.lemmatize(word, pos="v")
                    lem = lemmatizer.lemmatize(lem)
                    lemr = lemr + lem + " "
                no_punct = ""
                for char in lemr:
                    if char not in punctuations:
                        no_punct = no_punct + char
                data = word_tokenize(no_punct)
                line_count += 1
                stopWords = set(stopwords.words('english'))
                wordsFiltered = []
                for w in data:
                    if w not in stopWords:
                        wordsFiltered.append(w)
                pred = "C:\\Users\\Kripa\\Desktop\\" + subid + "2.csv"
                with open(pred, 'a+', newline='') as out_file:
                    writer = csv.writer(out_file, delimiter=' ')
                    writer.writerow(wordsFiltered[:20])

        def tokenize(text):
            for match in re.finditer(r'\w+', text, re.UNICODE):
                yield match.group(0)

        def listtostring(s):
            str1 = " "
            return str1.join(s)

        parse, category_names = liwc.load_token_parser("C:\\Users\\Kripa\\Desktop\\bigdic.dic")
        cnt = array('i', [0, 0, 0, 0, 0, 0])
        predator = "C:\\Users\\Kripa\\Desktop\\" + subid + "2.csv"
        with open(predator) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            j = 0
            for row in csv_reader:
                p = row.copy()
                p1 = listtostring(p).lower()
                p_token = tokenize(p1)
                from collections import Counter
                op1 = Counter(category for token in p_token for category in parse(token))
                op = dict(op1)
                l = list(op.keys())
                l.sort(reverse=True)
                if l:
                    j = l[0]
                if j == "S1":
                    cnt[0] = cnt[0] + 1
                if j == "S2":
                    cnt[1] = cnt[1] + 1
                if j == "S3":
                    cnt[2] = cnt[2] + 1
                if j == "S4":
                    cnt[3] = cnt[3] + 1
                if j == "S5":
                    cnt[4] = cnt[4] + 1
                if j == "S6":
                    cnt[5] = cnt[5] + 1
        sql = ("UPDATE {0} SET S1=%s, S2=%s, S3=%s, S4=%s, S5=%s, S6=%s WHERE Sender=%s"
               .format(tbl)) % (cnt[0], cnt[1], cnt[2], cnt[3], cnt[4], cnt[5], id1)
        mycursor.execute(sql)
        mydb.commit()
        import svm
        svm.func(tbl, id1)
        sql = ("SELECT Grooming_Not from {0} WHERE Sender=%s".format(tbl)) % id1
        mycursor.execute(sql)
        ress = mycursor.fetchall()
        check = [('Yes', )]
        # Check if conversation is grooming
        if ress == check:
            mydb.commit()
            # Alert via mail
            import mail
            mail.main_func(id, id1)
        os.remove('C:\\Users\\Kripa\\Desktop\\' + subid + '2.csv')
        os.remove(file_name1)
        os.remove('C:\\Users\\Kripa\\Desktop\\' + subid + '1.csv')
def test_category_names():
    _, category_names = liwc.load_token_parser(os.path.join(test_dir, "alpha.dic"))
    assert category_names == ["A", "Bravo"]
import os.path

dict_path = 'C:\\Users\\Dick Sang\\Desktop\\5. Data Analytics\\3. PolyU RA\\' \
            '1. Projects\\3. Cust Value Chain Analysis\\2. LIWC\\'
working_path = "C:\\Users\\Dick Sang\\Desktop\\5. Data Analytics\\3. PolyU RA\\" \
               "1. Projects\\3. Cust Value Chain Analysis\\" \
               "1. Apple Podcast_speeches\\import files\\"
export_path = "C:\\Users\\Dick Sang\\Desktop\\5. Data Analytics\\3. PolyU RA\\" \
              "1. Projects\\3. Cust Value Chain Analysis\\"

os.chdir(working_path)

import pandas as pd
import liwc

parse, category_names = liwc.load_token_parser(dict_path + 'Cust_val_chain_keywords.dic')

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import string
import re
import numpy as np
from dfply import *
from math import *
import glob
from collections import Counter

# locate the files inside the current folder
file_list = glob.glob("*.DOC")
def tokenize(text):
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)


def select(cur, variable, table):
    """Database function to retrieve a variable."""
    cur.execute("SELECT {v} FROM {t}".format(v=variable, t=table))
    variable = cur.fetchall()
    variable = [i[0] for i in variable]
    return variable


import liwc
parse, category_names = liwc.load_token_parser('LIWC2007_English080730.dic')

descriptions = np.array(select(cur, "DESCRIPTION", "data11"))
description_trans = np.array(select(cur, "DESCRIPTION_TRANSLATED", "data11"))

description = []
for i in range(len(descriptions)):
    if description_trans[i] == '':
        descr = descriptions[i]
    else:
        descr = description_trans[i]
    description.append(descr)
def get_categories_for_word(word, liwc_dictionary_path):
    parse, category_names = liwc.load_token_parser(liwc_dictionary_path)
    return [category for category in parse(word)]
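# Hedged usage sketch for get_categories_for_word above; the dictionary path is
# a placeholder, and the categories returned depend entirely on the .dic file used.
print(get_categories_for_word("happy", "LIWC2015_English.dic"))
# e.g. ['posemo', 'affect'] with an English LIWC dictionary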