# NOTE(review): fragment — this chunk imports only `stanfordnlp`, yet the code
# below uses `nltk`, `random`, and `fl` (filter_lines); those are presumably
# imported elsewhere in the full file — confirm.
import stanfordnlp

# Load pre-extracted "desire" hits; the loop starts at index 1, so line 0 is
# presumably a header row — TODO confirm.
lineList = [ line for line in open(r'results_desire.tsv', 'r', encoding='utf-8') ]
f = open(r"sample_desire.tsv", "w", encoding='utf-8')
filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['desire', 'desired']
    # Keep the line only if none of the exclusion heuristics fire
    # (gerund uses, intransitives, "desire to VB", adjectival/nominal uses,
    # verbal continuation right after the token at column 5).
    if not (fl.isGerund(my_split) or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isIntransitive(my_split, tagged_tokens)
            or fl.hasToAfterVerb(my_split, my_tokens)
            or fl.isAdjectiveOrNoun(my_split, my_tokens, tagged_tokens, formlist)
            or tagged_tokens[int(my_split[5]) + 1][1].startswith('VB')
            or (my_tokens[int(my_split[3]) + 1].lower() == 'to'
                and tagged_tokens[int(my_split[3]) + 2][1] == 'VB')):
        filtered_lines.append(lineList[s])
n = 0
sampled_numbers = []
# NOTE(review): the rest of this loop is cut off by the chunk boundary.  As
# shown, `n` is never incremented (the loop cannot terminate), and randint's
# inclusive upper bound `len(filtered_lines)` is one past the last valid
# list index — confirm against the full file.
while n < 100:
    s = random.randint(1, len(filtered_lines))
    if not s in sampled_numbers:
        sampled_numbers.append(s)
import random
import re
import nltk
import sys

sys.path.insert(0, '..')
import filter_lines as fl

# Sample 100 lines from the "avoid" query results after filtering out hits
# whose direct object is (or is followed by) a gerund.
# Line 0 of the input is skipped as a header row.
# BUG FIX: input/output files are now context-managed (the original leaked
# the input handle and relied on interpreter exit to flush the output).
with open(r'results_avoid.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    if not (fl.dobjIsGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)):
        filtered_lines.append(lineList[s])

# BUG FIX: the original sampled from range(1, len(filtered_lines)), which
# could never select filtered_lines[0]; sample over the full index range.
# A set makes the per-line membership test O(1).
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_avoid.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
    # NOTE(review): fragment — the `def` line of the function this `return`
    # belongs to (apparently `dobjIsGerund`, which is called unqualified
    # below) lies outside this chunk.
    return(False)

def dobjTooFarFromVerb(my_split):
    # True when the direct object (token index in column 3) is 10 or more
    # tokens after the verb (token index in column 5) — TODO confirm the
    # column semantics against filter_lines.
    if int(my_split[3]) - int(my_split[5]) >=10:
        return(True)
    else:
        return(False)

# Sample 100 lines from the "try" query results after heavy heuristic
# filtering.  Line 0 is presumably a header row — TODO confirm.
lineList = [line for line in open(r'results_try.tsv', 'r', encoding='utf-8')]
f = open(r"sample_try.tsv", "w",encoding='utf-8')
filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['tried','try','trying','tries']
    # Exclusions: capitalized dobj, dobj far from verb, gerunds,
    # "try to/and ...", adjectival and phrasal uses, idioms such as
    # "try your luck/hand/best/...", particles ('out', 'again'),
    # verbal continuations, intransitives, and one known false positive.
    if not (my_split[2][0].isupper() or dobjTooFarFromVerb(my_split)
            or dobjIsGerund(my_split)
            or fl.hasPattern(my_split[7].lower(), "tr(i|y)[^\s]*\s(to|and)\s")
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isAdjective(my_split, my_tokens,tagged_tokens, formlist)
            or fl.isPhrasalVerb(my_split,tagged_tokens)
            or fl.hasPattern(my_split[7], '\str(y|i)[^\s]*\s[^\s]*\s(luck|hand|best|while|lot|fortune|patience)\s')
            or my_tokens[int(my_split[5])+1] in ['out','again']
            or my_split[1] in ['case','time','lot','fortune','day','year','patience','hand']
            or tagged_tokens[int(my_split[3])+1][1] == 'RP'
            or tagged_tokens[int(my_split[5])+1][1].startswith('VB')
            or fl.isIntransitive(my_split,tagged_tokens)
            or 'Try Tag Rugby' in my_split[7]):
        filtered_lines.append(lineList[s])
# NOTE(review): range(1, ...) means filtered_lines[0] can never be sampled.
sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range (len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])
f.close()
# Sample from the "would love" query results.  NOTE(review): fragment — this
# chunk uses `nltk`, `random`, and `fl` without importing them; they are
# presumably imported elsewhere in the full file.
lineList = [line for line in open(r'results_would_love.tsv', 'r', encoding='utf-8')]
f = open(r"sample_would_love.tsv", "w",encoding='utf-8')
filtered_lines = []
for s in range (1,len(lineList)):
    my_split_orig = lineList[s].split('\t')
    # Drop columns 3 and 8-9 so the record lines up with the layout the
    # fl.* helpers expect — TODO confirm against filter_lines.
    my_split = my_split_orig[:3]+my_split_orig[4:8]+my_split_orig[10:]
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    # Exclusions: gerund uses, "to VB" continuations, verbal continuations,
    # far-away dobj, and "nothing more/better than to ...".
    if not (fl.isGerund (my_split) or fl.hasGerundAfter(my_split, my_tokens)
            or fl.hasToAfterVerb(my_split, my_tokens)
            or (my_tokens[int(my_split[3])+1].lower() == 'to'
                and tagged_tokens[int(my_split[3])+2][1] == 'VB')
            or tagged_tokens[int(my_split[3])+1][1].startswith('VB')
            or tagged_tokens[int(my_split[5])+1][1].startswith('VB')
            or fl.dobjTooFarFromVerb(my_split)
            or fl.hasPattern(my_split[7].lower(), "nothing\s(more|better)\sthan\sto\s")):
        filtered_lines.append(lineList[s])
n = 0
sampled_numbers = []
# Rejection-sample until 100 xcomp-free lines have been written.
# NOTE(review): the collapsed source makes the nesting of the last statements
# ambiguous; the indentation below assumes `n` counts *written* lines —
# confirm.  Also: filtered_lines[0] can never be sampled, and `f` is never
# closed in the visible code.
while n < 100:
    s = random.randint(1, len(filtered_lines)-1)
    if not s in sampled_numbers:
        sampled_numbers.append(s)
        my_split_orig = filtered_lines[s].split('\t')
        my_split = my_split_orig[:3]+my_split_orig[4:8]+my_split_orig[10:]
        if not fl.verbHasXcomp(my_split):
            f.write(filtered_lines[s])
            n = n+1
# BUG FIX: this script used `nltk` and `random` without importing them;
# both imports are added here.
import random

import nltk

import filter_lines as fl

# Sample 100 lines from the "miss" query results after filtering out
# non-target uses of "miss" (idioms like "miss the point/target/...",
# adjectival uses, phrasal "miss out", "go missing", intransitives).
# Line 0 of the input is skipped as a header row.
with open(r'results_miss.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['missing', 'miss']
    if not (fl.dobjIsGerund(my_split)
            or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasGerundAfter(my_split, my_tokens)
            or my_split[1] in ['target', 'fact', 'point', 'step', 'chance',
                               'call', 'opportunity', 'mark', 'issue']
            or fl.isPhrasalVerb(my_split, tagged_tokens)
            or my_tokens[int(my_split[5]) + 1] == 'out'
            or fl.hasPattern(
                my_split[7],
                'miss[^\s]*\s[^\s]*\s(target|call|opportunity|fact|point|step|chance|mark|issue)')
            or fl.hasPattern(my_split[7], '(go|went|gone|going)\smissing')
            or fl.isIntransitive(my_split, tagged_tokens)):
        filtered_lines.append(lineList[s])

# BUG FIX: sample over the full index range (the original started at 1 and
# could never pick filtered_lines[0]) and context-manage the output file
# (the original never closed it, risking lost buffered output).
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_miss.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
def lovedAsNounModifyer(my_split, my_tokens, tagged_tokens):
    """Filter out phrases where 'loved' modifies a noun, e.g.
    'Sunny is a much loved figure in the campus community .'

    Returns True when the token at index my_split[5] is 'loved' and either
    (a) it is preceded by a determiner/possessive/conjunction plus an
    adverbial intensifier and followed by a noun or adjective, or
    (b) it is followed by 'one'/'ones' ("loved ones").
    """
    verb_idx = int(my_split[5])
    # BUG FIX: the original compared the bound method `.lower` (missing call
    # parentheses) against ['one','ones'], so the "loved one(s)" clause was
    # always False.
    if (my_split[2].lower() == 'loved') and (
            (tagged_tokens[verb_idx - 2][1] in ['DT', 'PRP$', 'CC']
             and (tagged_tokens[verb_idx - 1][1].startswith('RB')
                  or tagged_tokens[verb_idx - 1][0].lower() in ['well', 'most', 'much'])
             and tagged_tokens[verb_idx + 1][1].startswith(('NN', 'JJ')))
            or tagged_tokens[verb_idx + 1][0].lower() in ['one', 'ones']):
        return True
    else:
        return False

# Sample 100 lines from the "love" query results after filtering out
# non-target uses ("love life", noun-modifying 'loved', gerunds,
# adjectival/nominal uses, "would have loved to ...").
# Line 0 of the input is skipped as a header row.
with open(r'results_love.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['love', 'loves']
    if not (fl.hasPattern(my_split[7], '\slove\slife')
            or lovedAsNounModifyer(my_split, my_tokens, tagged_tokens)
            or fl.isGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isAdjectiveOrNoun(my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasToAfterVerb(my_split, my_tokens)
            or (fl.hasPattern(my_split[7], "would\shave\sloved")
                and my_tokens[int(my_split[3]) + 1] == 'to')):
        filtered_lines.append(lineList[s])

# BUG FIX: sample over the full index range (the original started at 1 and
# could never pick filtered_lines[0]); context-manage the output file.
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_love.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
# BUG FIX: this script called random.sample without importing `random`.
import random
import re
import nltk
import sys

sys.path.insert(0, '..')
import filter_lines as fl

# Sample 100 lines from the "recommend" query results after filtering out
# subjunctive/infinitive complements, gerunds, adjectival 'recommended',
# "recommend ... to ...", and fixed collocations like
# "CODATA recommended value" / "recommended practice".
# Line 0 of the input is skipped as a header row.
with open(r'results_recommend.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['recommended']
    if not (fl.hasSubjunctiveAfterDobj(my_tokens, tagged_tokens, my_split)
            or fl.hasInfinitiveAfterDobj(my_tokens, tagged_tokens, my_split)
            or fl.dobjIsGerund(my_split)
            or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasPattern(my_split[7].lower(), "recommend[^\s]+\sto\s")
            or fl.hasGerundAfter(my_split, my_tokens)
            or (my_tokens[int(my_split[5])].lower() == 'recommended'
                and (my_tokens[int(my_split[5]) - 1].lower() in ['maximum', 'CODATA', 'a', 'the']
                     or my_tokens[int(my_split[5]) + 1].lower() in ['practice', 'value']))):
        filtered_lines.append(lineList[s])

# BUG FIX: sample over the full index range (the original started at 1 and
# could never pick filtered_lines[0]); context-manage the output file.
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_recommend.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
# Filter the "resent" query results.  NOTE(review): fragment — this chunk
# uses `nltk`, `random`, and `fl` without importing them; they are presumably
# imported elsewhere in the full file.
lineList = [line for line in open(r'results_resent.tsv', 'r', encoding='utf-8')]
f = open(r"sample_resent.tsv", "w",encoding='utf-8')
filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['resented']
    # Exclusions: gerund uses, adjectival/nominal uses, "resent to ...".
    if not (fl.isGerund (my_split) or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isAdjectiveOrNoun(my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasToAfterVerb(my_split, my_tokens)):
        filtered_lines.append(lineList[s])
n = 0
sampled_numbers = []
# NOTE(review): the rest of this loop is cut off by the chunk boundary.  As
# shown, `n` is never incremented (the loop cannot terminate), and randint's
# inclusive upper bound `len(filtered_lines)` is one past the last valid
# list index — confirm against the full file.
while n < 100:
    s = random.randint(1, len(filtered_lines))
    if not s in sampled_numbers:
        sampled_numbers.append(s)
        my_split = filtered_lines[s].split('\t')
import random
import re
import nltk
import sys

sys.path.insert(0, '..')
import filter_lines as fl

# Sample 100 lines from the "start" query results after filtering out
# gerund/adjectival uses, intransitives, "start to ...", phrasal verbs,
# and ordinal-number objects.
# Line 0 of the input is skipped as a header row.
# BUG FIX: removed the "© 2019 GitHub, Inc." scraping residue that was fused
# onto the end of this script (it made the file a syntax error).
with open(r'results_start.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['starting']
    if not (fl.isGerund(my_split)
            or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
            or fl.isIntransitive(my_split, tagged_tokens)
            or fl.hasPattern(my_split[7].lower(), "start[^\s]+\sto\s")
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isPhrasalVerb(my_split, tagged_tokens)
            or fl.hasOrdNumAfter(my_split, tagged_tokens)
            or fl.dobjIsOrdinalNumber(my_split, tagged_tokens)):
        filtered_lines.append(lineList[s])

# BUG FIX: sample over the full index range (the original started at 1 and
# could never pick filtered_lines[0]); context-manage the output file.
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_start.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
# BUG FIX: this script called random.sample without importing `random`.
import random
import re
import nltk
import sys

sys.path.insert(0, '..')
import filter_lines as fl

# Sample 100 lines from the "like" query results after filtering out
# gerund uses, prepositional/comparative "like" (", like", "just like",
# copula + like), "would have liked to ...", and adjectival/nominal uses.
# Line 0 of the input is skipped as a header row.
with open(r'results_like.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['like', 'likes']
    if not (fl.isGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.hasPattern(my_split[7].lower(), ",\slike\s")
            or fl.hasPattern(my_split[7].lower(), "just\slike\s")
            or fl.hasToAfterVerb(my_split, my_tokens)
            or (fl.hasPattern(my_split[7], "would\shave\sliked")
                and my_tokens[int(my_split[3]) + 1] == 'to')
            or (my_tokens[int(my_split[5]) - 1] in ['be', 'was', 'were', 'am', 'are', 'is'])
            or (my_tokens[int(my_split[5]) - 2] in ['be', 'was', 'were', 'am', 'are', 'is']
                and my_tokens[int(my_split[5]) - 1] == 'not')
            or fl.isAdjectiveOrNoun(my_split, my_tokens, tagged_tokens, formlist)):
        filtered_lines.append(lineList[s])

# BUG FIX: sample over the full index range (the original started at 1 and
# could never pick filtered_lines[0]); context-manage the output file.
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_like.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
import random
import re
import nltk
import sys

sys.path.insert(0, '..')
import filter_lines as fl

# Sample 100 lines from the "quit" query results after filtering out
# gerund/adjectival uses, intransitives, phrasal verbs, and nominal
# "a/the quit".
# Line 0 of the input is skipped as a header row.
# BUG FIX: input/output files are now context-managed (the original leaked
# the input handle).
with open(r'results_quit.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['quit']
    if not (fl.isGerund(my_split)
            or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
            or fl.isIntransitive(my_split, tagged_tokens)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isPhrasalVerb(my_split, tagged_tokens)
            or fl.hasPattern(my_split[7].lower(), ' (a|the)[^a-zA-Z]*quit')):
        filtered_lines.append(lineList[s])

# BUG FIX: sample over the full index range (the original started at 1 and
# could never pick filtered_lines[0]).
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_quit.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
import random
import re
import nltk
import sys

sys.path.insert(0, '..')
import filter_lines as fl

# Sample 100 lines from the "afford" query results after filtering out
# gerund objects and "afford(s/ed) to ..." constructions.
# Line 0 of the input is skipped as a header row.
# BUG FIX: input/output files are now context-managed (the original leaked
# the input handle).
with open(r'results_afford.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    if not (fl.dobjIsGerund(my_split)
            or fl.hasPattern(my_split[7].lower(), "afford[^\s]+\sto\s")
            or fl.hasGerundAfter(my_split, my_tokens)):
        filtered_lines.append(lineList[s])

# BUG FIX: sample over the full index range (the original started at 1 and
# could never pick filtered_lines[0]).
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_afford.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
import random
import re
import nltk
import sys

sys.path.insert(0, '..')
import filter_lines as fl

# Sample 100 lines from the "dread" query results after filtering out
# gerund uses and adjectival/nominal 'dread(ed)'.
# Line 0 of the input is skipped as a header row.
# BUG FIX: input/output files are now context-managed (the original leaked
# the input handle).
with open(r'results_dread.tsv', 'r', encoding='utf-8') as infile:
    lineList = [line for line in infile]

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['dreaded', 'dread']
    if not (fl.isGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isAdjectiveOrNoun(my_split, my_tokens, tagged_tokens, formlist)):
        filtered_lines.append(lineList[s])

# BUG FIX: sample over the full index range (the original started at 1 and
# could never pick filtered_lines[0]).
sampleNumbers = set(random.sample(range(len(filtered_lines)), 100))
with open(r"sample_dread.tsv", "w", encoding='utf-8') as f:
    for s in range(len(filtered_lines)):
        if s in sampleNumbers:
            f.write(filtered_lines[s])
    # NOTE(review): fragment — the `def` line of `delayedIsAdjective`
    # (called below) lies above this chunk; the statements before the
    # script are shown at their original indentation inside that function.
    # Heuristic: 'delayed' followed (two tokens on, allowing "'s") by
    # something other than 'until'/'for'/'by', or sentence-initial
    # 'delayed', is treated as adjectival — TODO confirm intent.
    if (len(my_tokens) > int(my_split[5])+3) and not (my_tokens[int(my_split[5])+2] == "'" and my_tokens[int(my_split[5])+3] == 's'):
        if not my_tokens[int(my_split[5])+2] in ['until','for','by']:
            print(my_split[7])
            return(True)
    elif (my_split[2].lower() == 'delayed') and int(my_split[5]) == 0:
        print(my_split[7])
        return (True)
    return(False)

# Sample 100 lines from the "delay" query results.  NOTE(review): `nltk`,
# `random`, and `fl` are used but imported elsewhere in the full file.
lineList = [line for line in open(r'results_delay.tsv', 'r', encoding='utf-8')]
f = open(r"sample_delay.tsv", "w",encoding='utf-8')
filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')      # tab-separated record
    my_tokens = my_split[7].split(' ')      # column 7 holds the sentence text
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['delayed']
    if not (fl.isGerund(my_split)
            or fl.isAdjective(my_split, my_tokens,tagged_tokens, formlist)
            or fl.hasGerundAfter(my_split, my_tokens)
            or delayedIsAdjective(my_split, my_tokens)):
        filtered_lines.append(lineList[s])
# NOTE(review): range(1, ...) means filtered_lines[0] can never be sampled.
sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range (len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])
f.close()