def __init__(self, text):
    """
    Creates a SPLAT Object.

    ``text`` may be the path of something that exists on disk, in which
    case it is opened and every stripped line becomes one utterance, or a
    plain string, which is split on newlines into utterances. Sentences,
    tokens, types and the derived counts and averages are computed once
    here and stored on the instance.

    :param text: path to a text file, or the text itself.
    :raises ValueError: if ``text`` is neither an existing path nor a str.
    """
    # os.path.exists() raises TypeError for objects it cannot interpret as
    # a path (None, lists, ...). Treat those as "not a path" so they reach
    # the ValueError below instead of escaping as an unrelated TypeError.
    try:
        is_existing_path = os.path.exists(text)
    except TypeError:
        is_existing_path = False

    if is_existing_path:
        self.__name = text
        try:
            # The context manager closes the handle even if reading fails.
            with open(text, 'r') as source:
                file_utts = [line.strip() for line in source]
        except IsADirectoryError:
            print("WARNING: '" + text + "' is a directory. It is being treated as a string.")
            self.__splat = text
            self.__utterances = [line.strip() for line in text.split("\n")]
        else:
            # Every utterance is followed by one space, the last included.
            self.__splat = "".join(utt + " " for utt in file_utts)
            self.__utterances = file_utts
    elif isinstance(text, str):
        # Raw strings are labelled with their first 20 characters.
        self.__name = text[0:20]
        self.__splat = text
        self.__utterances = [line.strip() for line in text.split("\n")]
    else:
        raise ValueError("WARNING: SPLAT must be of type str or file.")

    self.__uttcount = len(self.__utterances)
    self.__sentences = CleanSentenizer().sentenize(self.__splat)
    # Fall back to the utterances when the sentenizer returns no sentences.
    if self.__sentences == []:
        self.__sentences = self.__utterances
    self.__sentcount = len(self.__sentences)

    self.__rawtokens = RawTokenizer().tokenize(self.__splat)
    self.__tokens = CleanTokenizer().tokenize(self.__splat)
    self.__rawtypes = Util.typify(self.__rawtokens)
    self.__types = Util.typify(self.__tokens)
    self.__wordcount = Util.wordcount(self.__rawtokens)
    self.__unique_wordcount = Util.wordcount(self.__types)
    self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)

    # Average words per utterance / per sentence; 0.0 avoids dividing by zero.
    self.__alu = round(float(self.__wordcount) / float(self.__uttcount), 4) if self.__uttcount != 0 else 0.0
    self.__als = round(float(self.__wordcount) / float(self.__sentcount), 4) if self.__sentcount != 0 else 0.0

    # count_disfluencies returns a pair; element 0 and element 1 are stored
    # separately for utterances and for sentences.
    temp_dpu = Util.count_disfluencies(self.__utterances)
    self.__dpu = temp_dpu[0]
    self.__adpu = temp_dpu[1]
    temp_dps = Util.count_disfluencies(self.__sentences)
    self.__dps = temp_dps[0]
    self.__adps = temp_dps[1]
    self.__disfluencies = Util.total_disfluencies(self.__dpu)
def __init__(self, text):
    """
    Creates a SPLAT Object.

    ``text`` may be the path of an existing regular file, in which case
    every stripped line of the file becomes one utterance, or a plain
    string, which is split on newlines into utterances. Sentences, tokens,
    types and the derived counts and averages are computed once here and
    stored on the instance.

    :param text: path to a text file, or the text itself.
    :raises ValueError: if ``text`` is neither an existing file nor a str.
    """
    # os.path.isfile() raises TypeError for objects it cannot interpret as
    # a path (None, lists, ...). Treat those as "not a file" so they reach
    # the ValueError below instead of escaping as an unrelated TypeError.
    try:
        is_file = os.path.isfile(text)
    except TypeError:
        is_file = False

    if is_file:
        self.__name = text
        try:
            # The context manager closes the handle even if reading fails.
            with open(text, 'r') as source:
                file_utts = [line.strip() for line in source]
        except IsADirectoryError:
            # isfile() already excludes directories, so this only fires if
            # the path turns into a directory between the check and open().
            print("WARNING: '" + text + "' is a directory. It is being treated as a string.")
            self.__splat = text
            self.__utterances = [line.strip() for line in text.split("\n")]
        else:
            # Every utterance is followed by one space, the last included.
            self.__splat = "".join(utt + " " for utt in file_utts)
            self.__utterances = file_utts
    elif isinstance(text, str):
        # Raw strings are labelled with their first 20 characters.
        self.__name = text[0:20]
        self.__splat = text
        self.__utterances = [line.strip() for line in text.split("\n")]
    else:
        raise ValueError("WARNING: SPLAT must be of type str or file.")

    self.__uttcount = len(self.__utterances)
    self.__sentences = CleanSentenizer().sentenize(self.__splat)
    # Fall back to the utterances when the sentenizer returns no sentences.
    if self.__sentences == []:
        self.__sentences = self.__utterances
    self.__sentcount = len(self.__sentences)

    self.__rawtokens = RawTokenizer().tokenize(self.__splat)
    self.__tokens = CleanTokenizer().tokenize(self.__splat)
    self.__rawtypes = Util.typify(self.__rawtokens)
    self.__types = Util.typify(self.__tokens)
    self.__wordcount = Util.wordcount(self.__rawtokens)
    self.__unique_wordcount = Util.wordcount(self.__types)
    self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)

    # Average words per utterance / per sentence; 0.0 avoids dividing by zero.
    self.__alu = round(float(self.__wordcount) / float(self.__uttcount), 4) if self.__uttcount != 0 else 0.0
    self.__als = round(float(self.__wordcount) / float(self.__sentcount), 4) if self.__sentcount != 0 else 0.0

    # count_disfluencies returns a pair; element 0 and element 1 are stored
    # separately for utterances and for sentences.
    temp_dpu = Util.count_disfluencies(self.__utterances)
    self.__dpu = temp_dpu[0]
    self.__adpu = temp_dpu[1]
    temp_dps = Util.count_disfluencies(self.__sentences)
    self.__dps = temp_dps[0]
    self.__adps = temp_dps[1]
    self.__disfluencies = Util.total_disfluencies(self.__dpu)
def run(self, data):
    """
    Counts the disfluencies in every sentence of each corpus.

    For each corpus in ``data`` a SPLAT object is built from
    ``corpus.contents`` and ``Util.count_disfluencies`` is applied to its
    sentences. Element 0 of that result is read as a mapping from a
    sentence key to eight counts, which are relabelled and summed.

    :param data: iterable of corpus objects exposing ``contents`` and ``id``.
    :return: a JSON string encoding a list with one object per corpus,
        holding 'corpus_id', 'sentences',
        'average_disfluencies_per_sentence' and 'total_disfluencies'.
    :raises TransactionException: if a TypeError occurs while processing.
    """
    # Labels in the positional order of each per-sentence count record.
    labels = ("UM", "UH", "AH", "ER", "HM", "SILENT PAUSE", "REPETITION", "BREAK")
    results = []
    try:
        for corpus in data:
            temp_bubble = SPLAT(corpus.contents)
            # NOTE(review): these prints look like debugging output; they
            # are kept so that what is written to stdout does not change.
            print(corpus.contents)
            print(temp_bubble.sents())
            raw_disfluencies = Util.count_disfluencies(temp_bubble.sents())
            print(raw_disfluencies)
            per_sentence = raw_disfluencies[0]

            sentences = {}
            totals = dict.fromkeys(labels, 0)
            grand_total = 0
            # Relabel the positional counts so they read better in JSON,
            # accumulating per-type and overall totals along the way.
            for i in per_sentence:
                counts = per_sentence[i]
                temp_dis = {label: counts[pos] for pos, label in enumerate(labels)}
                sentences[i] = temp_dis
                for label, value in temp_dis.items():
                    grand_total += value
                    totals[label] += value

            # Average disfluencies per sentence over the whole text; a
            # corpus with no sentences averages 0.0 rather than raising
            # ZeroDivisionError.
            sentence_count = len(per_sentence)
            if sentence_count:
                average_disfluencies = float(grand_total / sentence_count)
            else:
                average_disfluencies = 0.0

            # Same key order as the labels, with the overall total last.
            total_disfluencies = dict(totals)
            total_disfluencies["TOTAL"] = grand_total

            results.append({
                'corpus_id': corpus.id,
                'sentences': sentences,
                'average_disfluencies_per_sentence': average_disfluencies,
                'total_disfluencies': total_disfluencies
            })
        results = json.dumps(results)
        print(results)
        return results
    except TypeError as err:
        # Chain the cause so the original TypeError stays in the traceback.
        raise TransactionException('Corpus contents does not exist.') from err