Skip to content

oyahiroki/nlp4j

Folders and files

Name
Last commit message
Last commit date

Latest commit

 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

NLP4J

Natural Language Processing Library for Java

png

NLP4J Components

Core Data, Utilities, CSV/Json/Plaintext parser, etc. : nlp4j-core
English language NLP: nlp4j-stanford
Japanese language NLP: nlp4j-kuromoji, nlp4j-cabocha, nlp4j-mecab, nlp4j-yahoojp, nlp4j-sudachi
Wikipedia dump file parser, MediaWiki API client: wiki
Data crawling: twitter, webcrawler, wikipedia dump
Document search: Apache Solr, Azure

NLP4J Maven for English NLP

<!-- for English NLP -->
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-stanford</artifactId>
    <version>1.3.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
	<groupId>edu.stanford.nlp</groupId>
	<artifactId>stanford-corenlp</artifactId>
	<version>4.4.0</version>
	<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
	<groupId>edu.stanford.nlp</groupId>
	<artifactId>stanford-corenlp</artifactId>
	<version>4.4.0</version>
	<classifier>models</classifier>
	<scope>provided</scope>
</dependency>

NLP4J Code for simple English Morphological analysis

// Run the Stanford POS annotator over a single sentence and print each keyword.
String input = "I eat sushi with chopsticks.";

// Annotator configured to read the document's "text" attribute.
DocumentAnnotator annotator = new DocumentAnnotatorBuilder<>(StanfordPosAnnotator.class)
		.set("target", "text")
		.build();

// Wrap the raw sentence in a Document and annotate it.
Document document = new DocumentBuilder().text(input).build();
annotator.annotate(document);

// One line per keyword: begin,end,facet,lex
document.getKeywords().forEach(keyword -> System.out.printf("%s,%s,%s,%s%n",
		keyword.getBegin(), keyword.getEnd(), keyword.getFacet(), keyword.getLex()));

// Expected output:
// 0,1,word.PRP,I
// 2,5,word.VBP,eat
// 6,11,word.NN,sushi
// 12,16,word.IN,with
// 17,27,word.NNS,chopstick
// 27,28,word..,.

NLP4J Code for simple English Syntax analysis

// POS + dependency analysis of one sentence, printing each dependency tree.
StanfordPosDependencyAnnotator annotator = new StanfordPosDependencyAnnotator();
annotator.setProperty("target", "text");

Document document = new DefaultDocument();
document.putAttribute("text", "I eat sushi with chopsticks.");

annotator.annotate(document);

document.getKeywords().forEach(kwd -> {
	// Only dependency-bearing keywords carry a tree; skip the rest.
	if (!(kwd instanceof KeywordWithDependency)) {
		return;
	}
	KeywordWithDependency root = (KeywordWithDependency) kwd;
	// Dump the whole tree as XML, then list the root's direct children.
	System.out.println(root.toStringAsXml());
	System.out.println("I: " + kwd.getLex());
	root.getChildren().forEach(c -> System.out.println("children: " + c.getLex()));
});

// Expected output
// <?xml version="1.0" encoding="UTF-8"?>
// <w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
//     <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
//     <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
//     <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
//         <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
//     </w>
//     <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
// </w>
// I: eat
// children: I
// children: sushi
// children: chopstick
// children: .

NLP4J Code for simple English Syntax analysis (2)

public static void main(String[] args) throws Exception {
	// Annotator performing POS + dependency analysis on the "text" attribute.
	StanfordPosDependencyAnnotator annotator = new StanfordPosDependencyAnnotator();
	annotator.setProperty("target", "text");

	// Document holding the sentence to analyze.
	Document document = new DefaultDocument();
	document.putAttribute("text", "I eat sushi with chopsticks.");

	annotator.annotate(document);

	// For each dependency tree: dump it as XML, then walk it edge by edge.
	document.getKeywords().forEach(kwd -> {
		if (kwd instanceof KeywordWithDependency) {
			KeywordWithDependency root = (KeywordWithDependency) kwd;
			System.out.println(root.toStringAsXml());
			print(root);
		}
	});
}

/**
 * Recursively prints every dependency edge under {@code kd}
 * as "parent -> (relation) child", depth-first.
 */
private static void print(KeywordWithDependency kd) {
	for (KeywordWithDependency child : kd.getChildren()) {
		System.out.println(kd.getLex() + " -> (" + child.getRelation() + ") " + child.getLex());
		print(child);
	}
}
}

// Expected output:
// <?xml version="1.0" encoding="UTF-8"?>
// <w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
//     <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
//     <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
//     <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
//         <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
//     </w>
//     <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
// </w>
//
// eat -> (nsubj) I
// eat -> (obj) sushi
// eat -> (obl) chopstick
// chopstick -> (case) with
// eat -> (punct) .

NLP4J Code for Stanford NLP Open IE(Information Extraction), Triples, Clauses

// Extract Open IE triples and clauses from one sentence.
StanfordOpenIEAnnotator annotator = new StanfordOpenIEAnnotator();
annotator.setProperty("target", "text");

Document document = new DefaultDocument();
document.putAttribute("text",
		"Mount Fuji, located on the island of Honshu, is the highest mountain in Japan. ");

annotator.annotate(document);

// Each extraction arrives as a keyword; the facet tells triple from clause.
document.getKeywords().forEach(kwd -> {
	System.out.println(kwd.getFacet() + "," + kwd.getLex());
});

// Expected Output
// pattern.oie.triple,mount fuji , is highest mountain in , japan
// pattern.oie.triple,mount fuji , is mountain in , japan
// pattern.oie.triple,mount fuji , is , mountain
// pattern.oie.triple,mount fuji , is , highest mountain
// pattern.oie.triple,mount fuji , located on , island honshu
// pattern.oie.triple,highest mountain , is in , japan
// pattern.oie.triple,mount fuji , located on , island
// pattern.oie.clause,Mount Fuji located on the island of Honshu is the highest mountain in Japan
// pattern.oie.clause,Mount Fuji located on the island of Honshu

NLP4J Maven for Reading Wikipedia Dump

<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-wiki -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-wiki</artifactId>
    <version>1.1.0.0</version>
</dependency>

NLP4J Code for reading Wikipedia Dump

// Look up one article by title in an English Wikipedia multistream dump.
String title = "Nintendo";
String baseDir = "/usr/local/wiki/enwiki/20230101/";
// Index file for the dump
File indexFile = new File(baseDir + "enwiki-20230101-pages-articles-multistream-index.txt.bz2");
// Dump file
File dumpFile = new File(baseDir + "enwiki-20230101-pages-articles-multistream.xml.bz2");

// WikiDumpReader is AutoCloseable; try-with-resources closes it for us.
try (WikiDumpReader reader = new WikiDumpReader(dumpFile, indexFile)) {

	WikiPage page = reader.getItem(title);

	// Plain text of the article's root node
	System.out.println(page.getRootNodePlainText());
// Expected output:
// is a Japanese multinational video game company headquartered
// in Kyoto, Japan. It develops video games and video game consoles ...

	// Raw wiki markup of the whole page
	System.out.println("<text>\n" + page.getText() + "\n</text>");
// {{Short description|Japanese video game company}} <!-- popup
//  [[File:Nintendo.svg]] --> {{Pp-vandalism|small=yes}} {{Use dmy
//  dates|date=October 2022}} {{Use American English|date=November 2020}}
//  {{Infobox company | name = Nintendo Co., Ltd. | logo = Nintendo.svg |
//  logo_alt = Logo in white on red background since 2016 | logo_caption = Logo
//  in white on red background since 2016 | image =
//  Nintendo_Headquarters_-_panoramio.jpg ... 

}

See also

Natural Language Processing with Groovy, OpenNLP, CoreNLP, Nlp4j, Datumbox, Smile, Spark NLP, DJL and TensorFlow

https://groovy.apache.org/blog/natural-language-processing-with-groovy

Author

Hiroki Oya twitter linkedin

About

Natural Language Processing library for Java

Topics

Resources

License

Stars

Watchers

Forks

Packages

No packages published

Languages