import fileinput
import pickle
import sys

import nltk

# cf, c, d, and n below are project-local helper modules (entity/unicode
# conversion, statement trimming, noun extraction); their import lines are
# not shown in the original snippets, so they are left as-is here.


def get_lines(filename, pickedlines, outfile):
    # Count only tab-delimited, tag-free lines; write the second field of
    # every counted line whose index was picked.
    i = 0
    for line in open(filename):
        if len(line.split("\t")) > 1 and "<" not in line:
            if i in pickedlines:
                outfile.write(cf.convert_entities(line.split("\t")[1]))
            i += 1
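# Hedged driver sketch for get_lines: sample_lines, its parameters, and the
# default corpus size are illustrative assumptions, not from the source.
import random


def sample_lines(infile, outpath, population=10000, k=100):
    # Pick k random line indices and extract just those claims.
    picked = set(random.sample(range(population), k))
    with open(outpath, "w") as out:
        get_lines(infile, picked, out)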
def cleanup(claim):
    claim = cf.convert_entities(claim)
    claim = cf.convert_unicode(claim)
    claim = trim_claim_start(claim)
    claim = trim_claim_end(claim)
    # Strip quoting and bracketing characters outright.
    claim = (claim.replace(" '", " ").replace("' ", " ")
             .replace('"', "").replace("[", "").replace("]", "")
             .replace("(", "").replace(")", ""))
    # Break hyphenated and slashed compounds into separate words.
    claim = claim.replace("-", " ").replace("/", " ")
    return claim
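# Illustration of the character stripping above (hedged: the exact output
# also depends on the project-local trim and cf helpers, so this is only
# the rough effect of the replace chain):
#
#     cleanup('The "moon" [probably] is-not made/of cheese (really)')
#     # -> roughly: The moon probably is not made of cheese really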
def trim_statement(claim):
    claim = c.convert_entities(claim)
    words = nltk.word_tokenize(claim)
    taggedwords = nltk.pos_tag(words)
    trimtagged = trim_statement_tagged(words, taggedwords, claim)
    if trimtagged:
        # trim_statement_tagged returns (word, tag) pairs; rejoin the words.
        return " ".join([word[0] for word in trimtagged])
    else:
        return False
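# Hedged usage sketch: trim_statement_tagged is a project-local helper, so
# this only runs inside the project, and the sample sentence is invented.
# Note the False return when trimming fails, so callers must check it:
#
#     trimmed = trim_statement("Some say that the earth is flat.")
#     if trimmed:
#         print(trimmed)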
def main():
    claims = []
    for line in fileinput.input():
        if "<" not in line:
            claim = line.split("\t")[1]
            claim = c.convert_entities(claim)
            words = nltk.word_tokenize(claim)
            taggedwords = nltk.pos_tag(words)
            claims.append((claim, taggedwords))
    # pickle needs a binary stream under Python 3, hence sys.stdout.buffer.
    pickle.dump(claims, sys.stdout.buffer)
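# Companion sketch for reading the pickled claims back in; load_claims and
# the path argument are hypothetical (the script writes to stdout, so the
# file would come from a shell redirect of this program's output).
def load_claims(path):
    with open(path, "rb") as fh:
        return pickle.load(fh)  # a list of (claim, [(word, POS tag), ...])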
def main():
    for line in fileinput.input():
        claim = line.split("\t")[1].replace("\n", "")
        claim = c.convert_entities(claim)
        words = nltk.word_tokenize(claim)
        tagged = nltk.pos_tag(words)
        trimmed = d.trim_statement_tagged(words, tagged, claim)
        if trimmed:
            nouns = n.get_nouns_tagged(trimmed)
            trimclaim = " ".join([word[0] for word in trimmed])
            # One record per claim: trimmed claim, then its nouns, tab-separated.
            sys.stdout.write(trimclaim)
            for noun in nouns:
                sys.stdout.write("\t" + noun)
            sys.stdout.write("\n")
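# Downstream sketch: main() above emits one line per kept claim, the trimmed
# claim followed by its nouns, tab-separated. read_trimmed is a hypothetical
# consumer showing how to split those records back apart.
def read_trimmed(path):
    for line in open(path):
        fields = line.rstrip("\n").split("\t")
        yield fields[0], fields[1:]  # (claim, [noun, ...])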