Example #1
0
import random
import re
import nltk
import sys
sys.path.insert(0, '..')
import filter_lines as fl

# Draw a random sample of the "avoid" rows that pass the gerund filters and
# write it to sample_avoid.tsv.
SAMPLE_SIZE = 100

# Read every row up front; the context manager closes the input file, which
# the bare open() inside a comprehension never did.
with open(r'results_avoid.tsv', 'r', encoding='utf-8') as results_file:
    lineList = results_file.readlines()

filtered_lines = []
# The first row is skipped (presumably a header row -- confirm against the
# results file).
for line in lineList[1:]:
    my_split = line.split('\t')
    # Column 7 is split on single spaces to obtain the tokens for the filters.
    my_tokens = my_split[7].split(' ')
    # No POS tagging here: neither filter below takes the tagged tokens, so
    # the former nltk.pos_tag call was wasted work.

    # Keep only rows that neither filter flags.
    if not (fl.dobjIsGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)):
        filtered_lines.append(line)

# Sample over ALL kept rows (the old range started at 1 and could never pick
# the first one) and cap the sample size so that a small result set no longer
# raises ValueError. Sorting keeps the output in input-file order.
sample_count = min(SAMPLE_SIZE, len(filtered_lines))
sampleNumbers = sorted(random.sample(range(len(filtered_lines)), sample_count))

# Open the output only once there is something to write, and close it even if
# writing fails.
with open(r"sample_avoid.tsv", "w", encoding='utf-8') as f:
    f.writelines(filtered_lines[i] for i in sampleNumbers)
Example #2
0
# Draw a random sample of the "suggest" rows that survive all exclusion
# filters and write it to sample_suggest.tsv.
SAMPLE_SIZE = 100

# Read every row up front; the context manager closes the input file, which
# the bare open() inside a comprehension never did.
with open(r'results_suggest.tsv', 'r', encoding='utf-8') as results_file:
    lineList = results_file.readlines()

filtered_lines = []
# The first row is skipped (presumably a header row -- confirm against the
# results file).
for line in lineList[1:]:
    my_split = line.split('\t')
    # Column 7 is split on single spaces to obtain the tokens; column 5 is
    # used below as an integer index into those tokens.
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['suggested']

    # The predicates are evaluated in the original order on purpose: the
    # index-based checks can raise on short rows and rely on the earlier
    # predicates short-circuiting first.
    excluded = (
        dobjIsSubj(my_tokens, tagged_tokens, my_split)
        or fl.hasSubjunctiveAfterDobj(my_tokens, tagged_tokens, my_split)
        or (fl.hasInfinitiveAfterDobj(my_tokens, tagged_tokens, my_split)
            and my_tokens[int(my_split[5]) + 1] not in ['a', 'an'])
        or fl.dobjIsGerund(my_split)
        or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
        # Raw string: same pattern text, without invalid-escape warnings.
        or (fl.hasPattern(my_split[7].lower(), r"suggest[^\s]+\sto\s")
            and tagged_tokens[int(my_split[5]) + 2][1].startswith('VB'))
        or fl.hasGerundAfter(my_split, my_tokens)
        or my_tokens[int(my_split[5]) + 1] == ','
    )
    if not excluded:
        filtered_lines.append(line)

# Sample over ALL kept rows (the old range started at 1 and could never pick
# the first one) and cap the sample size so that a small result set no longer
# raises ValueError. Sorting keeps the output in input-file order.
sample_count = min(SAMPLE_SIZE, len(filtered_lines))
sampleNumbers = sorted(random.sample(range(len(filtered_lines)), sample_count))

# Open the output only once there is something to write, and close it even if
# writing fails.
with open(r"sample_suggest.tsv", "w", encoding='utf-8') as f:
    f.writelines(filtered_lines[i] for i in sampleNumbers)
Example #3
0
import sys
sys.path.insert(0, '..')
import filter_lines as fl

# Filter the "miss" rows and choose which of the survivors to sample.
# The names f, filtered_lines and sampleNumbers are kept because the code that
# follows this block writes the sampled rows through them.
SAMPLE_SIZE = 100

# Direct-object values (column 1) that exclude a row outright.
EXCLUDED_OBJECTS = (
    'target', 'fact', 'point', 'step', 'chance', 'call', 'opportunity',
    'mark', 'issue',
)

# Read every row up front; the context manager closes the input file, which
# the bare open() inside a comprehension never did.
with open(r'results_miss.tsv', 'r', encoding='utf-8') as results_file:
    lineList = results_file.readlines()

# Left open deliberately: the writing loop after this block uses `f`.
f = open(r"sample_miss.tsv", "w", encoding='utf-8')

filtered_lines = []
# The first row is skipped (presumably a header row -- confirm against the
# results file).
for line in lineList[1:]:
    my_split = line.split('\t')
    # Column 7 is split on single spaces to obtain the tokens; column 5 is
    # used below as an integer index into those tokens.
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['missing', 'miss']

    # The predicates are evaluated in the original order on purpose: the
    # index-based check can raise on short rows and relies on the earlier
    # predicates short-circuiting first.
    excluded = (
        fl.dobjIsGerund(my_split)
        or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
        or fl.hasGerundAfter(my_split, my_tokens)
        or my_split[1] in EXCLUDED_OBJECTS
        or fl.isPhrasalVerb(my_split, tagged_tokens)
        or my_tokens[int(my_split[5]) + 1] == 'out'
        # Raw strings: same pattern text, without invalid-escape warnings.
        or fl.hasPattern(
            my_split[7],
            r'miss[^\s]*\s[^\s]*\s(target|call|opportunity|fact|point|step|chance|mark|issue)'
        )
        or fl.hasPattern(my_split[7], r'(go|went|gone|going)\smissing')
        or fl.isIntransitive(my_split, tagged_tokens)
    )
    if not excluded:
        filtered_lines.append(line)

# Sample over ALL kept rows (the old range started at 1 and could never pick
# the first one) and cap the sample size so that a small result set no longer
# raises ValueError. A set makes the later `s in sampleNumbers` test O(1).
sampleNumbers = set(random.sample(range(len(filtered_lines)),
                                  min(SAMPLE_SIZE, len(filtered_lines))))
for s in range(len(filtered_lines)):
import re
import nltk
import sys
sys.path.insert(0,'..')
import filter_lines as fl


    
# Draw a random sample of the "recommend" rows that survive all exclusion
# filters and write it to sample_recommend.tsv.
SAMPLE_SIZE = 100

# Read every row up front; the context manager closes the input file, which
# the bare open() inside a comprehension never did.
with open(r'results_recommend.tsv', 'r', encoding='utf-8') as results_file:
    lineList = results_file.readlines()

filtered_lines = []
# The first row is skipped (presumably a header row -- confirm against the
# results file).
for line in lineList[1:]:
    my_split = line.split('\t')
    # Column 7 is split on single spaces to obtain the tokens; column 5 is
    # used below as an integer index into those tokens.
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['recommended']

    # The predicates are evaluated in the original order on purpose: the
    # index-based checks can raise on short rows and rely on the earlier
    # predicates short-circuiting first.
    excluded = (
        fl.hasSubjunctiveAfterDobj(my_tokens, tagged_tokens, my_split)
        or fl.hasInfinitiveAfterDobj(my_tokens, tagged_tokens, my_split)
        or fl.dobjIsGerund(my_split)
        or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
        # Raw string: same pattern text, without invalid-escape warnings.
        or fl.hasPattern(my_split[7].lower(), r"recommend[^\s]+\sto\s")
        or fl.hasGerundAfter(my_split, my_tokens)
        # "recommended" next to one of a few fixed neighbouring words.
        # The neighbour is lower-cased before the comparison, so the word
        # list must be lower case too: the former 'CODATA' entry could never
        # match.
        # NOTE(review): when column 5 is 0, the "- 1" index wraps around to
        # the last token -- confirm whether that is intended.
        or (my_tokens[int(my_split[5])].lower() == 'recommended'
            and (my_tokens[int(my_split[5]) - 1].lower()
                 in ['maximum', 'codata', 'a', 'the']
                 or my_tokens[int(my_split[5]) + 1].lower()
                 in ['practice', 'value']))
    )
    if not excluded:
        filtered_lines.append(line)

# Sample over ALL kept rows (the old range started at 1 and could never pick
# the first one) and cap the sample size so that a small result set no longer
# raises ValueError. Sorting keeps the output in input-file order.
sample_count = min(SAMPLE_SIZE, len(filtered_lines))
sampleNumbers = sorted(random.sample(range(len(filtered_lines)), sample_count))

# Open the output only once there is something to write, and close it even if
# writing fails.
with open(r"sample_recommend.tsv", "w", encoding='utf-8') as f:
    f.writelines(filtered_lines[i] for i in sampleNumbers)
Example #5
0
            return (True)
    return (False)


# Draw a random sample of the "mind" rows that survive all exclusion filters
# and write it to sample_mind.tsv.
SAMPLE_SIZE = 100

# Read every row up front; the context manager closes the input file, which
# the bare open() inside a comprehension never did.
with open(r'results_mind.tsv', 'r', encoding='utf-8') as results_file:
    lineList = results_file.readlines()

filtered_lines = []
# The first row is skipped (presumably a header row -- confirm against the
# results file).
for line in lineList[1:]:
    my_split = line.split('\t')
    # Column 7 is split on single spaces to obtain the tokens.
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['mind']

    # The predicates are evaluated in the original order on purpose: the
    # index-based check can raise on short rows and relies on the earlier
    # predicates short-circuiting first.
    excluded = (
        containsPattern(my_split)
        or fl.dobjIsGerund(my_split)
        or fl.hasGerundAfter(my_split, my_tokens)
        # NOTE(review): this indexes the tokens via column 3, whereas the
        # sibling sampling scripts use column 5 -- confirm which is intended.
        or my_tokens[int(my_split[3]) + 1].lower().endswith('ing')
        or my_split[1].lower() in ('business', 'time')
        or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
        or fl.isIntransitive(my_split, tagged_tokens)
        or my_split[1].lower().endswith('ing')
    )
    if not excluded:
        filtered_lines.append(line)

# Sample over ALL kept rows (the old range started at 1 and could never pick
# the first one) and cap the sample size so that a small result set no longer
# raises ValueError. Sorting keeps the output in input-file order.
sample_count = min(SAMPLE_SIZE, len(filtered_lines))
sampleNumbers = sorted(random.sample(range(len(filtered_lines)), sample_count))

# Open the output only once there is something to write, and close it even if
# writing fails.
with open(r"sample_mind.tsv", "w", encoding='utf-8') as f:
    f.writelines(filtered_lines[i] for i in sampleNumbers)
Example #6
0
import random
import re
import nltk
import sys
sys.path.insert(0,'..')
import filter_lines as fl

# Draw a random sample of the "afford" rows that survive the exclusion filters
# and write it to sample_afford.tsv.
SAMPLE_SIZE = 100

# Read every row up front; the context manager closes the input file, which
# the bare open() inside a comprehension never did.
with open(r'results_afford.tsv', 'r', encoding='utf-8') as results_file:
    lineList = results_file.readlines()

filtered_lines = []
# The first row is skipped (presumably a header row -- confirm against the
# results file).
for line in lineList[1:]:
    my_split = line.split('\t')
    # Column 7 is split on single spaces to obtain the tokens for the filters.
    my_tokens = my_split[7].split(' ')
    # No POS tagging here: none of the filters below takes the tagged tokens,
    # so the former nltk.pos_tag call was wasted work.

    excluded = (
        fl.dobjIsGerund(my_split)
        # Raw string: same pattern text, without invalid-escape warnings.
        or fl.hasPattern(my_split[7].lower(), r"afford[^\s]+\sto\s")
        or fl.hasGerundAfter(my_split, my_tokens)
    )
    if not excluded:
        filtered_lines.append(line)

# Sample over ALL kept rows (the old range started at 1 and could never pick
# the first one) and cap the sample size so that a small result set no longer
# raises ValueError. Sorting keeps the output in input-file order.
sample_count = min(SAMPLE_SIZE, len(filtered_lines))
sampleNumbers = sorted(random.sample(range(len(filtered_lines)), sample_count))

# Open the output only once there is something to write, and close it even if
# writing fails.
with open(r"sample_afford.tsv", "w", encoding='utf-8') as f:
    f.writelines(filtered_lines[i] for i in sampleNumbers)