Example #1

import stanfordnlp
import random
import nltk
import sys
sys.path.insert(0, '..')
import filter_lines as fl
# nltk.pos_tag used below needs the averaged-perceptron tagger model
# (installed once via nltk.download()).

lineList = [
    line for line in open(r'results_desire.tsv', 'r', encoding='utf-8')
]
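# Column layout inferred from how the fields are used below: column 2 holds the
# verb form, columns 3 and 5 hold token indices into the sentence, and column 7
# holds the whitespace-tokenized sentence itself.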

f = open(r"sample_desire.tsv", "w", encoding='utf-8')

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['desire', 'desired']
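    # Keep the line only if none of the filters fire: 'desire(d)' is not a gerund,
    # not adjectival or nominal, not intransitive, and not followed by an
    # infinitive ('desire to VERB'), leaving plain transitive uses.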

    if not (fl.isGerund(my_split) or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isIntransitive(my_split, tagged_tokens)
            or fl.hasToAfterVerb(my_split, my_tokens) or fl.isAdjectiveOrNoun(
                my_split, my_tokens, tagged_tokens, formlist)
            or tagged_tokens[int(my_split[5]) + 1][1].startswith('VB') or
            (my_tokens[int(my_split[3]) + 1].lower() == 'to'
             and tagged_tokens[int(my_split[3]) + 2][1] == 'VB')):

        filtered_lines.append(lineList[s])

n = 0
sampled_numbers = []
while n < 100:
    s = random.randint(1, len(filtered_lines) - 1)
    if not s in sampled_numbers:
        sampled_numbers.append(s)
        f.write(filtered_lines[s])
        n = n + 1

f.close()

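# The later examples draw the 100-line sample in a single step with
# random.sample(range(1, len(filtered_lines)), 100) instead of a while loop.
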
Example #2

import random
import nltk
import sys
sys.path.insert(0, '..')
import filter_lines as fl

lineList = [line for line in open(r'results_would_love.tsv', 'r', encoding='utf-8')]

f = open(r"sample_would_love.tsv", "w",encoding='utf-8')

filtered_lines = []
for s in range (1,len(lineList)):
    my_split_orig = lineList[s].split('\t')
    my_split = my_split_orig[:3]+my_split_orig[4:8]+my_split_orig[10:]
    
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    
    
   
    
    if not (fl.isGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.hasToAfterVerb(my_split, my_tokens)
            or (my_tokens[int(my_split[3]) + 1].lower() == 'to'
                and tagged_tokens[int(my_split[3]) + 2][1] == 'VB')
            or tagged_tokens[int(my_split[3]) + 1][1].startswith('VB')
            or tagged_tokens[int(my_split[5]) + 1][1].startswith('VB')
            or fl.dobjTooFarFromVerb(my_split)
            or fl.hasPattern(my_split[7].lower(), "nothing\s(more|better)\sthan\sto\s")):
        filtered_lines.append(lineList[s])
    
n = 0
sampled_numbers = []
while n < 100:
    s = random.randint(1, len(filtered_lines) - 1)
    if not s in sampled_numbers:
        sampled_numbers.append(s)
        my_split_orig = filtered_lines[s].split('\t')
        my_split = my_split_orig[:3] + my_split_orig[4:8] + my_split_orig[10:]
        if not fl.verbHasXcomp(my_split):
            f.write(filtered_lines[s])
            n = n + 1

f.close()
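
# filter_lines (imported above as fl) is not included in these snippets. Judging
# only from the call sites, fl.hasPattern(text, pattern) appears to be a thin
# wrapper around a regular-expression search; a minimal sketch under that
# assumption (not necessarily the module's actual code):
import re

def hasPattern(text, pattern):
    # True if the regular expression matches anywhere in the sentence string
    return re.search(pattern, text) is not None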
        
Example #3

import random
import re
import nltk
import sys
sys.path.insert(0, '..')
import filter_lines as fl

lineList = [
    line for line in open(r'results_resume.tsv', 'r', encoding='utf-8')
]

f = open(r"sample_resume.tsv", "w", encoding='utf-8')

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['resumed', 'resume']

    if not (fl.isGerund(my_split)
            or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isIntransitive(my_split, tagged_tokens)):
        filtered_lines.append(lineList[s])

sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range(len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])

f.close()

Example #4

import random
import re
import nltk
import sys
sys.path.insert(0,'..')
import filter_lines as fl

lineList = [line for line in open(r'results_start.tsv', 'r', encoding='utf-8')]

f = open(r"sample_start.tsv", "w",encoding='utf-8')

filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['starting']
    
    if not (fl.isGerund(my_split)
            or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
            or fl.isIntransitive(my_split, tagged_tokens)
            or fl.hasPattern(my_split[7].lower(), "start[^\s]+\sto\s")
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isPhrasalVerb(my_split, tagged_tokens)
            or fl.hasOrdNumAfter(my_split, tagged_tokens)
            or fl.dobjIsOrdinalNumber(my_split, tagged_tokens)):
        filtered_lines.append(lineList[s])


sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range (len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])
    
f.close()

Example #5

import random
import nltk
import sys
sys.path.insert(0, '..')
import filter_lines as fl


# filters out phrases like 'Sunny is a much loved figure in the campus community .'
def lovedAsNounModifyer(my_split, my_tokens, tagged_tokens):
    i = int(my_split[5])
    if (my_split[2].lower() == 'loved'
            and ((tagged_tokens[i - 2][1] in ['DT', 'PRP$', 'CC']
                  and (tagged_tokens[i - 1][1].startswith('RB')
                       or tagged_tokens[i - 1][0].lower() in ['well', 'most', 'much'])
                  and tagged_tokens[i + 1][1].startswith(('NN', 'JJ')))
                 or tagged_tokens[i + 1][0].lower() in ['one', 'ones'])):
        # print(my_split[7])
        return True
    else:
        return False

lineList = [line for line in open(r'results_love.tsv', 'r', encoding='utf-8')]

f = open(r"sample_love.tsv", "w",encoding='utf-8')

filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['love','loves']
    
    if not (fl.hasPattern(my_split[7], '\slove\slife')
            or lovedAsNounModifyer(my_split, my_tokens, tagged_tokens)
            or fl.isGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isAdjectiveOrNoun(my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasToAfterVerb(my_split, my_tokens)
            or (fl.hasPattern(my_split[7], "would\shave\sloved")
                and my_tokens[int(my_split[3]) + 1] == 'to')):
        filtered_lines.append(lineList[s])
    
    


sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range (len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])
    
f.close()


Example #6

import random
import nltk
import sys
sys.path.insert(0, '..')
import filter_lines as fl

lineList = [line for line in open(r'results_resent.tsv', 'r', encoding='utf-8')]

f = open(r"sample_resent.tsv", "w",encoding='utf-8')

filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['resented']
    
    if not (fl.isGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isAdjectiveOrNoun(my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasToAfterVerb(my_split, my_tokens)):
        filtered_lines.append(lineList[s])

n = 0
sampled_numbers = []
while n < 100:
    s = random.randint(1, len(filtered_lines) - 1)
    if not s in sampled_numbers:
        sampled_numbers.append(s)
        my_split = filtered_lines[s].split('\t')
        f.write(filtered_lines[s])
        n = n + 1

f.close()

Example #7

import random
import re
import nltk
import sys 
sys.path.insert(0,'..')
import filter_lines as fl




lineList = [line for line in open(r'results_like.tsv', 'r', encoding='utf-8')]

f = open(r"sample_like.tsv", "w",encoding='utf-8')

filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['like', 'likes']
    
    if not (fl.isGerund(my_split)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.hasPattern(my_split[7].lower(), ",\slike\s")
            or fl.hasPattern(my_split[7].lower(), "just\slike\s")
            or fl.hasToAfterVerb(my_split, my_tokens)
            or (fl.hasPattern(my_split[7], "would\shave\sliked")
                and my_tokens[int(my_split[3]) + 1] == 'to')
            or my_tokens[int(my_split[5]) - 1] in ['be', 'was', 'were', 'am', 'are', 'is']
            or (my_tokens[int(my_split[5]) - 2] in ['be', 'was', 'were', 'am', 'are', 'is']
                and my_tokens[int(my_split[5]) - 1] == 'not')
            or fl.isAdjectiveOrNoun(my_split, my_tokens, tagged_tokens, formlist)):
        filtered_lines.append(lineList[s])
    


sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range (len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])
    
f.close()

Example #8

import random
import re
import nltk
import sys
sys.path.insert(0,'..')
import filter_lines as fl

lineList = [line for line in open(r'results_quit.tsv', 'r', encoding='utf-8')]

f = open(r"sample_quit.tsv", "w",encoding='utf-8')

filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['quit']
    
    if not (fl.isGerund(my_split)
            or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
            or fl.isIntransitive(my_split, tagged_tokens)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isPhrasalVerb(my_split, tagged_tokens)
            or fl.hasPattern(my_split[7].lower(), ' (a|the)[^a-zA-Z]*quit')):
        filtered_lines.append(lineList[s])


sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range (len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])
    
f.close()

Example #9

import random
import nltk
import sys
sys.path.insert(0, '..')
import filter_lines as fl

# Only the tail of the completeIsAdjective helper referenced below survived in
# this snippet; its def line and opening condition are missing.
        print(my_split[7])
        return (True)
    else:
        return (False)


lineList = [
    line for line in open(r'results_complete.tsv', 'r', encoding='utf-8')
]

f = open(r"sample_complete.tsv", "w", encoding='utf-8')

filtered_lines = []
for s in range(1, len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['complete']

    if not (fl.isGerund(my_split) or completeIsAdjective(
            my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasGerundAfter(my_split, my_tokens)
            or fl.isIntransitive(my_split, tagged_tokens)):
        filtered_lines.append(lineList[s])

sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range(len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])

f.close()

Example #10

import random
import nltk
import sys
sys.path.insert(0, '..')
import filter_lines as fl

# delayedIsAdjective, used below, is likewise truncated here; only its final
# branches remain.
        if (len(my_tokens) > int(my_split[5]) + 3
                and not (my_tokens[int(my_split[5]) + 2] == "'"
                         and my_tokens[int(my_split[5]) + 3] == 's')):
            if not my_tokens[int(my_split[5]) + 2] in ['until', 'for', 'by']:
                print(my_split[7])
                return True
    elif (my_split[2].lower() == 'delayed') and int(my_split[5]) == 0:
        print(my_split[7])
        return True
    return False
        

lineList = [line for line in open(r'results_delay.tsv', 'r', encoding='utf-8')]

f = open(r"sample_delay.tsv", "w",encoding='utf-8')

filtered_lines = []
for s in range (1,len(lineList)):
    my_split = lineList[s].split('\t')
    my_tokens = my_split[7].split(' ')
    tagged_tokens = nltk.pos_tag(my_tokens)
    formlist = ['delayed']
    
    if not (fl.isGerund(my_split)
            or fl.isAdjective(my_split, my_tokens, tagged_tokens, formlist)
            or fl.hasGerundAfter(my_split, my_tokens)
            or delayedIsAdjective(my_split, my_tokens)):
        filtered_lines.append(lineList[s])


sampleNumbers = random.sample(range(1, len(filtered_lines)), 100)
for s in range (len(filtered_lines)):
    if s in sampleNumbers:
        f.write(filtered_lines[s])
    
f.close()