def prepareA0(segments, questions):
    """Build the markdown sheet and the CSV scoring template.

    For every segment that has at least one 'CONFIRM' entry (as returned by
    ``prefixMap``), the last such entry is used: its ``text2`` is listed
    under its ``sid`` and labelled with the segment's ``usid``. When
    ``firstViableTrg`` returns a truthy object for the segment, its ``text2``
    is listed under the same ``sid`` with the label ``v<usid>``.

    Args:
        segments: Iterable of segment objects exposing ``seg['usid']``.
        questions: Mapping from ``sid`` to the question text; ``*`` in the
            text is replaced by ``__`` in the markdown output.

    Returns:
        A ``(markdown, csv)`` tuple of strings. The CSV starts with the
        header ``USID, Score`` and holds one ``"<label>",0`` row per listed
        text.
    """
    grouped = {}
    for seg in segments:
        confirms = prefixMap(seg, 'CONFIRM')
        if len(confirms) == 0:
            # Segments that were never confirmed are left out entirely.
            continue
        final = confirms[-1]
        grouped.setdefault(final['sid'], []).append(
            (str(seg['usid']), final['text2']))
        viable = firstViableTrg(seg)
        if viable:
            grouped.setdefault(final['sid'], []).append(
                (f'v{seg["usid"]}', viable['text2']))

    mdParts = []
    csvParts = ['USID, Score\n']
    for sid, entries in grouped.items():
        question = questions[sid].replace('*', '__')
        mdParts.append(f'\n\n\n## {sid}\n')
        # Sids starting with 't' get a different instruction line than the
        # remaining ones.
        if sid.startswith('t'):
            helpText = 'Popište daný problém technické podpoře.'
        else:
            helpText = 'Položte dotaz, na který odpovídá vyznačená část v textu.'
        mdParts.append(f'_{helpText}_\n\n')
        mdParts.append(f'{question}\n\n')
        for label, text in entries:
            padded = label.rjust(7)
            mdParts.append(f'- `{padded}` {text}\n')
            csvParts.append(f'"{padded}",0\n')

    # Line-break tags are flattened to spaces in the markdown only.
    markdown = ''.join(mdParts).replace('<br>', ' ').replace('</br>', ' ')
    return markdown, ''.join(csvParts)
def domainKeyMap(logs, key, func=lambda x: x):
    """Select the segments whose first 'NEXT' sid matches a regex.

    Args:
        logs: Iterable of segments accepted by ``prefixMap``.
        key: Regular expression matched (via ``re.match``, i.e. anchored at
            the start) against the first ``sid`` that ``prefixMap`` yields
            for the 'NEXT' prefix.
        func: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A list with the matching segments, in input order.

    Raises:
        IndexError: If a segment yields no 'NEXT' entry.
    """
    return [
        entry
        for entry in logs
        if re.match(key, prefixMap(entry, 'NEXT', lambda x: x['sid'])[0])
    ]
def createBlog(segments):
    """Convert raw segments into enriched per-segment dictionaries.

    Each input segment is a non-empty sequence of dict-like lines that all
    carry a ``usid`` key. The output record stores the ``usid`` once (taken
    from the first line) and keeps copies of the lines, without ``usid``,
    under ``items``. It is then extended, in this order, with ``sid``,
    ``domain`` (first character of ``sid``), an empty ``rating`` dict,
    ``first_viable_src``, ``first_viable_trg``, ``final`` (last 'CONFIRM'
    entry or ``None``) and ``backtracking``.

    Args:
        segments: Iterable of raw segments.

    Returns:
        A list of the newly built segment dictionaries.
    """
    converted = []
    for rawLines in segments:
        record = {'usid': rawLines[0]['usid']}
        record['items'] = []
        for rawLine in rawLines:
            entry = dict(rawLine)
            # pop() without a default still raises KeyError when 'usid' is
            # missing, just like ``del`` would.
            entry.pop('usid')
            record['items'].append(entry)

        # The helpers below receive the partially built record, so the
        # order of these assignments is kept as is.
        record['sid'] = getSID(record)
        record['domain'] = record['sid'][0]
        record['rating'] = {}

        # First viable object by source, then by target.
        record['first_viable_src'] = firstViableSrc(record)
        record['first_viable_trg'] = firstViableTrg(record)

        # Final object: the last confirmation, if there was any.
        confirms = prefixMap(record, 'CONFIRM')
        record['final'] = confirms[-1] if len(confirms) != 0 else None

        record['backtracking'] = not isWithoutBacktracking(record)
        converted.append(record)
    return converted
def withoutBacktracking(segments):
    """Collect the segments for which ``isWithoutBacktracking`` holds.

    For every such segment the result holds a two-element list: the last
    ``text1`` that ``prefixMap`` yields for the 'TRANSLATE1' prefix, and
    ``len(segment)``. Two summary lines with the counts are printed to
    standard output.

    Args:
        segments: Sized iterable of segments.

    Returns:
        A list of ``[last_text1, len(segment)]`` lists, in input order.
    """
    selected = []
    for candidate in segments:
        if not isWithoutBacktracking(candidate):
            continue
        sources = prefixMap(candidate, 'TRANSLATE1', lambda x: x['text1'])
        selected.append([sources[-1], len(candidate)])

    print('Number of all segments:', len(segments))
    print('Number of segments without backtracking:', len(selected))
    return selected
def firstViableTrgEditsDistribution(segment):
    """Describe how the first viable target differs from the final one.

    The ``text2`` of the object returned by ``firstViableTrg`` and the
    ``text2`` of the last 'CONFIRM' entry are tokenized and compared with
    ``difflib.SequenceMatcher``. The opcode spans are summed per tag and
    normalised by the total number of tokens covered.

    Span lengths are measured on the first-viable side (``i2 - i1``) for
    'equal', 'replace' and 'delete'. For 'insert' the span on that side is
    always empty (``i1 == i2``), so the length of the inserted tokens
    (``j2 - j1``) is used instead; otherwise the insert share would always
    be zero.

    Args:
        segment: Segment accepted by ``firstViableTrg`` and ``prefixMap``.

    Returns:
        ``None`` when ``firstViableTrg`` returns ``None``. Otherwise a tuple
        ``(ratio, equal, replace, insert, delete)`` where ``ratio`` is
        ``SequenceMatcher.ratio()`` and the remaining values are shares of
        the covered tokens. When no tokens are covered at all (both token
        sequences empty) the four shares are ``0.0``.

    Raises:
        IndexError: If the segment has no 'CONFIRM' entry.
    """
    viable = firstViableTrg(segment)
    if viable is None:
        return None
    viableTrg = viable['text2']
    lastConfirmTrg = prefixMap(segment, 'CONFIRM', lambda x: x['text2'])[-1]

    sm = SequenceMatcher(None, tokenize(viableTrg), tokenize(lastConfirmTrg))

    # Single pass over the opcodes instead of one filter/map pair per tag.
    totals = {'equal': 0, 'replace': 0, 'insert': 0, 'delete': 0}
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'insert':
            # Inserts have i1 == i2; their size only shows on the b side.
            totals[tag] += j2 - j1
        else:
            totals[tag] += i2 - i1

    sum_all = float(sum(totals.values()))
    if sum_all == 0:
        # Nothing to distribute; avoid dividing by zero.
        return (sm.ratio(), 0.0, 0.0, 0.0, 0.0)

    return (
        sm.ratio(),
        totals['equal'] / sum_all,
        totals['replace'] / sum_all,
        totals['insert'] / sum_all,
        totals['delete'] / sum_all,
    )
print(f'- - - insert: {avgDistribution[3]/len(sEdits)*100:.2f}%') print(f'- - - delete: {avgDistribution[4]/len(sEdits)*100:.2f}%') # Only for technical issues check how much it overlaps if args.questions_flat != None: print('\nSpecific for tech issues:') with open(args.questions_flat, 'r') as f: questions = json.loads(f.read()) questions = {k:v.replace('*', '') for k,v in questions.items()} techSegments = domainKeyMap(segments, 't[0-9]{2}') lineRatios = [] confirmRatios = [] for seg in techSegments: question = questions[seg['sid']] if not isSkipped(seg): translates = prefixMap(seg, 'TRANSLATE1', lambda x: x['text1']) # these two are the same on the collected data if True: sm = SequenceMatcher(None, tokenize(question), tokenize(translates[0])) maxR = sm.ratio() else: maxR = -1 for line in translates: sm = SequenceMatcher(None, tokenize(question), tokenize(line)) maxR = max(sm.ratio(), maxR) maxR = max(sm.ratio(), maxR) lineRatios.append(maxR) lastConfirmSrc = prefixMap(seg, 'CONFIRM', lambda x: x['text1'])[-1] sm = SequenceMatcher(None, tokenize(question), tokenize(lastConfirmSrc)) confirmRatios.append(sm.ratio())
return sm.ratio() > 0.8 out = [] for segment in segments: if segment['final']: text1 = segment['final']['text1'] # out.append([text1]) if segment['first_viable_src']: text1 = segment['first_viable_src']['text1'] if text1[-1] in ['.', '?']: out.append([text1]) if False and ('rating' in segment) and ('final' in segment['rating']): text1 = segment['final']['text1'] text2 = segment['final']['text2'] bts = prefixMap(segment, 'TRANSLATE2') for backObj in bts: if backObj['text2'] == text2: text3 = backObj['text3'] if isSimilar(text1, text3): out.append( (segment['rating']['final'], text1, text2, text3)) break # sort by score out = sorted(out, key=lambda x: x[0], reverse=True) with open(args.csvout, 'w') as f: for outObj in out: f.write('\t'.join([str(x) for x in outObj]) + '\n')
def getSID(segment):
    """Return the first ``sid`` found among the segment's 'NEXT' entries.

    Args:
        segment: Segment accepted by ``prefixMap``.

    Returns:
        The first value ``prefixMap`` yields for the 'NEXT' prefix when
        mapping each entry to its ``sid``.

    Raises:
        Exception: If there is no such entry.
    """
    sids = prefixMap(segment, 'NEXT', lambda x: x['sid'])
    if len(sids) != 0:
        return sids[0]
    raise Exception('Domain could not be found')