Ejemplo n.º 1
0
    def test_tokenize2(self):
        """Compare ``tokenize_one(self.text2)`` against a four-line expectation.

        Every expected line is a ``!``-separated run of keywords, and each
        line (including the last one) is terminated by a newline.
        """
        keyword_lines = (
            "clinical trial design--effect!prone positioning!clinical outcomes!infants!children!acute respiratory distress syndrome",
            "purpose!paper describes!methodology!clinical trial!prone positioning!pediatric patients!acute lung injury!ali",
            "nonrandomized studies suggest!prone positioning improves oxygenation!patients!ali!acute respiratory distress syndrome!risk!serious iatrogenic injury",
            "known!improvements!oxygenation result!improvements!clinical outcomes",
        )
        # Terminate every line so the text ends with a trailing newline.
        expected = "".join(line + "\n" for line in keyword_lines)
        actual = tokenize_one(self.text2)
        self.assertEqual(actual, expected)
Ejemplo n.º 2
0
def tokenize_text(args):
    """Tokenize the input file chunk by chunk into a gzip-compressed text file.

    This is the part where we separate the text according to the following
    rule: the stopwords are used to split the text into expressions, and in
    that way keywords are identified, as an alternative to using n-grams.

    Example (! is used as separator of the keywords):
        - input: "Timing of replacement therapy for acute renal failure after cardiac surgery"
        - output: "timing!replacement therapy!acute renal failure!cardiac surgery"
    Another example:
        - input: "Acute renal failure (ARF) following cardiac surgery remains a significant cause of mortality. The aim of this study is to compare early and intensive use of continuous veno-venous hemodiafiltration (CVVHDF) with conservative usage of CVVHDF in patients with ARF after cardiac surgery."
        - output: "acute renal failure!arf!following cardiac surgery remains!significant cause!mortality!aim!study!compare early!intensive!continuous veno-venous hemodiafiltration!cvvhdf!conservative usage!cvvhdf!patients!arf!cardiac surgery"

    Args:
        args: Options object supporting ``in`` membership tests and exposing
            the attributes read here:
            ``tokenized_path`` (optional; output file path),
            ``experiment_path`` (directory used when ``tokenized_path`` is
            absent; the file is then named ``tokenized.txt.gz``),
            ``input_filename`` and ``lines_chunks`` (forwarded to
            ``get_file_chunks``), ``additional_stopwords`` (comma-separated
            string), ``verbose`` (forwarded to ``log``) and ``sample_size``
            (values <= 0 disable the early stop).

    Returns:
        The path of the gzip file the tokenized text was written to.
    """
    if "tokenized_path" in args:
        tokenized_path = args.tokenized_path
    else:
        tokenized_path = os.path.join(args.experiment_path, "tokenized.txt.gz")

    # NOTE(review): text mode without an explicit encoding uses the platform
    # default; confirm readers of this file agree before pinning one.
    with gzip.open(tokenized_path, "wt") as _output:
        # The text is processed in chunks so that progress can be reported.
        chunks = get_file_chunks(args.input_filename, args.lines_chunks, args)
        for chunk_number, text_part in enumerate(chunks, start=1):
            # The stopword list is rebuilt per call on purpose: tokenize_one
            # is not visible here and may modify the list it is given.
            tokenized = tokenize_one(
                text_part,
                additional_stopwords=args.additional_stopwords.split(","))
            _output.write(tokenized)

            # Progress is counted in whole chunks, so this is an upper bound
            # when the last chunk is shorter than ``lines_chunks``.
            lines_processed = chunk_number * args.lines_chunks
            log("%s lines processed" % lines_processed,
                verbose=args.verbose,
                inline=True)
            if 0 < args.sample_size <= lines_processed:
                break
    return tokenized_path
Ejemplo n.º 3
0
 def test_tokenize(self):
     """Compare ``tokenize_one(self.text)`` against the expected keywords.

     The expectation is a single ``!``-separated string with no trailing
     newline; it is split across adjacent literals only for readability.
     """
     expected_keywords = (
         "strategically located!cultural centre!nation!ccn!next!national museum!"
         "ministry!education!new headquarters!national bank!huaca san borja!"
         "design!lcc!satisfy!strategic objectives!cultural!economic motor!country!"
         "representing!meeting place!heart!city enrooted!collective peruvian culture!"
         "turning!unique!flexible!technologically advanced architectonic landmark!"
         "finally!triggering!urban transformation!cnn!surroundings!near!m2!"
         "net area correspond!multipurpose convention halls!sizes!"
         "proportions varying from!m2!m2!allow!people!attend simultaneous events"
     )
     actual_keywords = tokenize_one(self.text)
     self.assertEqual(actual_keywords, expected_keywords)
Ejemplo n.º 4
0
 def test_tokenize(self):
     """Compare ``tokenize_one(self.text)`` against the expected keywords.

     The expectation is a single ``!``-separated string with no trailing
     newline; it is split across adjacent literals only for readability.
     """
     expected_keywords = (
         "ejection fraction!ef!important predictors!survival!patients!"
         "left ventricular!lv!dysfunction!packer!large reduction!"
         "mortality figures!carvedilol!contrast!former studies!bisoprolol!"
         "metoprolol!investigated!difference!survival!related!difference!"
         "improvement!lv function!different beta-blockers!searched!"
         "medline database!reference lists!articles!search!relation!"
         "beta-blocker treatment!improvement!ef!studies met!criteria!added!"
         "studies!hundred!patients!treated!metoprolol!mean follow!months!"
         "mean increase!ef!ef units!thousand!patients!treated!carvedilol!"
         "mean follow!months!mean increase!ef!ef units!hundred!patients!"
         "treated!bucindolol!mean follow!months!mean increase!ef!ef units!"
         "small studies!nebivolol!atenolol!propranolol!studied!combined!"
         "mean increase!ef!ef units!patients!idiopathic!"
         "ischemic cardiomyopathies!compared!average increase!ef units!vs!"
         "respectively!beta-blocker treatment!heart failure patients!"
         "irrespective!etiology!improved lv function!studies!appears!"
         "differences!beta-blockers!etiologies!small!probably insignificant!"
         "difference!survival rate!beta-blockers!compared!suggesting!"
         "mechanisms!improvement!lv function!beta-blockers!responsible!"
         "difference!survival"
     )
     actual_keywords = tokenize_one(self.text)
     self.assertEqual(actual_keywords, expected_keywords)